{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-06-28T00:48:58.396502Z",
     "start_time": "2019-06-28T00:48:58.392545Z"
    }
   },
   "source": [
    "<br/>\n",
    "<br/>\n",
    "\n",
    "# Part 4: Ensembling Is All You Need -- Blending To Achieve Higher Score \n",
    "<br/>\n",
    "<br/>\n",
    "Ensemble is a very powerful machine learning techique, we did't spend a lot of time to do this, but simply blending all of the model can give us great boost.\n",
    "\n",
    "What's more variety is very important.\n",
    "\n",
    "We have 5 good models:\n",
    "\n",
    "Bert Large------------0.9430<br/>\n",
    "Bert Base 1----------0.9415<br/>\n",
    "Bert Base 2----------0.9413<br/>\n",
    "GTP2------------------0.9387<br/>\n",
    "Ensemble LSTM----0.9369<br/>\n",
    "\n",
    "Although LSTM and GTP2 have relatively low score, but blending them to gether with Berts can give a great boost.\n",
    "\n",
    "What we did is:<br/>\n",
    "**Finall =  0.350*Bert_Large + 0.175*Bert_Base_1 + 0.175*Bert_Base_2 + 0.170*GPT2 + 0.130*LSTM** <br/>\n",
    "which improve the single best **0.9430 --> 0.9446**\n",
    "\n",
    "It's worth to mention that the **weight average of the score is 0.9409**, but simply blending the models can **lower the variance and gives us 0.9446**."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Blending Pipline\n",
    "<br/>\n",
    "Here's a pipline that load all models and make predictions. Nothing special."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using TensorFlow backend.\n"
     ]
    }
   ],
   "source": [
    "%reload_ext autoreload\n",
    "%autoreload 2\n",
    "%matplotlib inline\n",
    "import numpy as np\n",
    "from pprint import pprint\n",
    "import pandas as pd\n",
    "import os\n",
    "import time\n",
    "import pickle\n",
    "import gc\n",
    "import random\n",
    "from tqdm._tqdm_notebook import tqdm_notebook as tqdm\n",
    "from keras.preprocessing import text, sequence\n",
    "import torch\n",
    "from torch import nn\n",
    "from torch.utils import data\n",
    "from torch.nn import functional as F\n",
    "device = torch.device('cuda')\n",
    "import pickle"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Ensemble  LSTM Prediction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "CRAWL_EMBEDDING_PATH = '../input/lstm-word-embedding/crawl-300d-2M.pkl'\n",
    "GLOVE_EMBEDDING_PATH = '../input/lstm-word-embedding/glove.840B.300d.pkl'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "NUM_MODELS = 2\n",
    "LSTM_UNITS = 128\n",
    "DENSE_HIDDEN_UNITS = 4 * LSTM_UNITS\n",
    "MAX_LEN = 300\n",
    "\n",
    "def get_coefs(word, *arr):\n",
    "    return word, np.asarray(arr, dtype='float32')\n",
    "\n",
    "\n",
    "def load_embeddings(path):\n",
    "    with open(path,'rb') as f:\n",
    "        emb_arr = pickle.load(f)\n",
    "    return emb_arr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def build_matrix(word_index, path):\n",
    "    embedding_index = load_embeddings(path)\n",
    "    embedding_matrix = np.zeros((max_features + 1, 300))\n",
    "    unknown_words = []\n",
    "    \n",
    "    for word, i in word_index.items():\n",
    "        if i <= max_features:\n",
    "            try:\n",
    "                embedding_matrix[i] = embedding_index[word]\n",
    "            except KeyError:\n",
    "                try:\n",
    "                    embedding_matrix[i] = embedding_index[word.lower()]\n",
    "                except KeyError:\n",
    "                    try:\n",
    "                        embedding_matrix[i] = embedding_index[word.title()]\n",
    "                    except KeyError:\n",
    "                        unknown_words.append(word)\n",
    "    return embedding_matrix, unknown_words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def sigmoid(x):\n",
    "    return 1 / (1 + np.exp(-x))\n",
    "\n",
    "class SpatialDropout(nn.Dropout2d):\n",
    "    def forward(self, x):\n",
    "        x = x.unsqueeze(2)    # (N, T, 1, K)\n",
    "        x = x.permute(0, 3, 2, 1)  # (N, K, 1, T)\n",
    "        x = super(SpatialDropout, self).forward(x)  # (N, K, 1, T), some features are masked\n",
    "        x = x.permute(0, 3, 2, 1)  # (N, T, 1, K)\n",
    "        x = x.squeeze(2)  # (N, T, K)\n",
    "        return x"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "test = pd.read_csv('../input/jigsaw-unintended-bias-in-toxicity-classification/test.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "symbols_to_isolate = '.,?!-;*\"…:—()%#$&_/@＼・ω+=”“[]^–>\\\\°<~•≠™ˈʊɒ∞§{}·τα❤☺ɡ|¢→̶`❥━┣┫┗Ｏ►★©―ɪ✔®\\x96\\x92●£♥➤´¹☕≈÷♡◐║▬′ɔː€۩۞†μ✒➥═☆ˌ◄½ʻπδηλσερνʃ✬ＳＵＰＥＲＩＴ☻±♍µº¾✓◾؟．⬅℅»Вав❣⋅¿¬♫ＣＭβ█▓▒░⇒⭐›¡₂₃❧▰▔◞▀▂▃▄▅▆▇↙γ̄″☹➡«φ⅓„✋：¥̲̅́∙‛◇✏▷❓❗¶˚˙）сиʿ✨。ɑ\\x80◕！％¯−ﬂﬁ₁²ʌ¼⁴⁄₄⌠♭✘╪▶☭✭♪☔☠♂☃☎✈✌✰❆☙○‣⚓年∎ℒ▪▙☏⅛ｃａｓǀ℮¸ｗ‚∼‖ℳ❄←☼⋆ʒ⊂、⅔¨͡๏⚾⚽Φ×θ￦？（℃⏩☮⚠月✊❌⭕▸■⇌☐☑⚡☄ǫ╭∩╮，例＞ʕɐ̣Δ₀✞┈╱╲▏▕┃╰▊▋╯┳┊≥☒↑☝ɹ✅☛♩☞ＡＪＢ◔◡↓♀⬆̱ℏ\\x91⠀ˤ╚↺⇤∏✾◦♬³の｜／∵∴√Ω¤☜▲↳▫‿⬇✧ｏｖｍ－２０８＇‰≤∕ˆ⚜☁'\n",
    "symbols_to_delete = '\\n🍕\\r🐵😑\\xa0\\ue014\\t\\uf818\\uf04a\\xad😢🐶️\\uf0e0😜😎👊\\u200b\\u200e😁عدويهصقأناخلىبمغر😍💖💵Е👎😀😂\\u202a\\u202c🔥😄🏻💥ᴍʏʀᴇɴᴅᴏᴀᴋʜᴜʟᴛᴄᴘʙғᴊᴡɢ😋👏שלוםבי😱‼\\x81エンジ故障\\u2009🚌ᴵ͞🌟😊😳😧🙀😐😕\\u200f👍😮😃😘אעכח💩💯⛽🚄🏼ஜ😖ᴠ🚲‐😟😈💪🙏🎯🌹😇💔😡\\x7f👌ἐὶήιὲκἀίῃἴξ🙄Ｈ😠\\ufeff\\u2028😉😤⛺🙂\\u3000تحكسة👮💙فزط😏🍾🎉😞\\u2008🏾😅😭👻😥😔😓🏽🎆🍻🍽🎶🌺🤔😪\\x08‑🐰🐇🐱🙆😨🙃💕𝘊𝘦𝘳𝘢𝘵𝘰𝘤𝘺𝘴𝘪𝘧𝘮𝘣💗💚地獄谷улкнПоАН🐾🐕😆ה🔗🚽歌舞伎🙈😴🏿🤗🇺🇸мυтѕ⤵🏆🎃😩\\u200a🌠🐟💫💰💎эпрд\\x95🖐🙅⛲🍰🤐👆🙌\\u2002💛🙁👀🙊🙉\\u2004ˢᵒʳʸᴼᴷᴺʷᵗʰᵉᵘ\\x13🚬🤓\\ue602😵άοόςέὸתמדףנרךצט😒͝🆕👅👥👄🔄🔤👉👤👶👲🔛🎓\\uf0b7\\uf04c\\x9f\\x10成都😣⏺😌🤑🌏😯ех😲Ἰᾶὁ💞🚓🔔📚🏀👐\\u202d💤🍇\\ue613小土豆🏡❔⁉\\u202f👠》कर्मा🇹🇼🌸蔡英文🌞🎲レクサス😛外国人关系Сб💋💀🎄💜🤢َِьыгя不是\\x9c\\x9d🗑\\u2005💃📣👿༼つ༽😰ḷЗз▱ц￼🤣卖温哥华议会下降你失去所有的钱加拿大坏税骗子🐝ツ🎅\\x85🍺آإشء🎵🌎͟ἔ油别克🤡🤥😬🤧й\\u2003🚀🤴ʲшчИОРФДЯМюж😝🖑ὐύύ特殊作戦群щ💨圆明园קℐ🏈😺🌍⏏ệ🍔🐮🍁🍆🍑🌮🌯🤦\\u200d𝓒𝓲𝓿𝓵안영하세요ЖљКћ🍀😫🤤ῦ我出生在了可以说普通话汉语好极🎼🕺🍸🥂🗽🎇🎊🆘🤠👩🖒🚪天一家⚲\\u2006⚭⚆⬭⬯⏖新✀╌🇫🇷🇩🇪🇮🇬🇧😷🇨🇦ХШ🌐\\x1f杀鸡给猴看ʁ𝗪𝗵𝗲𝗻𝘆𝗼𝘂𝗿𝗮𝗹𝗶𝘇𝗯𝘁𝗰𝘀𝘅𝗽𝘄𝗱📺ϖ\\u2000үսᴦᎥһͺ\\u2007հ\\u2001ɩｙｅ൦ｌƽｈ𝐓𝐡𝐞𝐫𝐮𝐝𝐚𝐃𝐜𝐩𝐭𝐢𝐨𝐧Ƅᴨןᑯ໐ΤᏧ௦Іᴑ܁𝐬𝐰𝐲𝐛𝐦𝐯𝐑𝐙𝐣𝐇𝐂𝐘𝟎ԜТᗞ౦〔Ꭻ𝐳𝐔𝐱𝟔𝟓𝐅🐋ﬃ💘💓ё𝘥𝘯𝘶💐🌋🌄🌅𝙬𝙖𝙨𝙤𝙣𝙡𝙮𝙘𝙠𝙚𝙙𝙜𝙧𝙥𝙩𝙪𝙗𝙞𝙝𝙛👺🐷ℋ𝐀𝐥𝐪🚶𝙢Ἱ🤘ͦ💸ج패티Ｗ𝙇ᵻ👂👃ɜ🎫\\uf0a7БУі🚢🚂ગુજરાતીῆ🏃𝓬𝓻𝓴𝓮𝓽𝓼☘﴾̯﴿₽\\ue807𝑻𝒆𝒍𝒕𝒉𝒓𝒖𝒂𝒏𝒅𝒔𝒎𝒗𝒊👽😙\\u200cЛ‒🎾👹⎌🏒⛸公寓养宠物吗🏄🐀🚑🤷操美𝒑𝒚𝒐𝑴🤙🐒欢迎来到阿拉斯ספ𝙫🐈𝒌𝙊𝙭𝙆𝙋𝙍𝘼𝙅ﷻ🦄巨收赢得白鬼愤怒要买额ẽ🚗🐳𝟏𝐟𝟖𝟑𝟕𝒄𝟗𝐠𝙄𝙃👇锟斤拷𝗢𝟳𝟱𝟬⦁マルハニチロ株式社⛷한국어ㄸㅓ니͜ʖ𝘿𝙔₵𝒩ℯ𝒾𝓁𝒶𝓉𝓇𝓊𝓃𝓈𝓅ℴ𝒻𝒽𝓀𝓌𝒸𝓎𝙏ζ𝙟𝘃𝗺𝟮𝟭𝟯𝟲👋🦊多伦🐽🎻🎹⛓🏹🍷🦆为和中友谊祝贺与其想象对法如直接问用自己猜本传教士没积唯认识基督徒曾经让相信耶稣复活死怪他但当们聊些政治题时候战胜因圣把全堂结婚孩恐惧且栗谓这样还♾🎸🤕🤒⛑🎁批判检讨🏝🦁🙋😶쥐스탱트뤼도석유가격인상이경제황을렵게만들지않록잘관리해야합다캐나에서대마초와화약금의품런성분갈때는반드시허된사용🔫👁凸ὰ💲🗯𝙈Ἄ𝒇𝒈𝒘𝒃𝑬𝑶𝕾𝖙𝖗𝖆𝖎𝖌𝖍𝖕𝖊𝖔𝖑𝖉𝖓𝖐𝖜𝖞𝖚𝖇𝕿𝖘𝖄𝖛𝖒𝖋𝖂𝕴𝖟𝖈𝕸👑🚿💡知彼百\\uf005𝙀𝒛𝑲𝑳𝑾𝒋𝟒😦𝙒𝘾𝘽🏐𝘩𝘨ὼṑ𝑱𝑹𝑫𝑵𝑪🇰🇵👾ᓇᒧᔭᐃᐧᐦᑳᐨᓃᓂᑲᐸᑭᑎᓀᐣ🐄🎈🔨🐎🤞🐸💟🎰🌝🛳点击查版🍭𝑥𝑦𝑧ＮＧ👣\\uf020っ🏉ф💭🎥Ξ🐴👨🤳🦍\\x0b🍩𝑯𝒒😗𝟐🏂👳🍗🕉🐲چی𝑮𝗕𝗴🍒ꜥⲣⲏ🐑⏰鉄リ事件ї💊「」\\uf203\\uf09a\\uf222\\ue608\\uf202\\uf099\\uf469\\ue607\\uf410\\ue600燻製シ虚偽屁理屈Г𝑩𝑰𝒀𝑺🌤𝗳𝗜𝗙𝗦𝗧🍊ὺἈἡχῖΛ⤏🇳𝒙ψՁմեռայինրւդձ冬至ὀ𝒁🔹🤚🍎𝑷🐂💅𝘬𝘱𝘸𝘷𝘐𝘭𝘓𝘖𝘹𝘲𝘫کΒώ💢ΜΟΝΑΕ🇱♲𝝈↴💒⊘Ȼ🚴🖕🖤🥘📍👈➕🚫🎨🌑🐻𝐎𝐍𝐊𝑭🤖🎎😼🕷ｇｒｎｔｉｄｕｆｂｋ𝟰🇴🇭🇻🇲𝗞𝗭𝗘𝗤👼📉🍟🍦🌈🔭《🐊🐍\\uf10aლڡ🐦\\U0001f92f\\U0001f92a🐡💳ἱ🙇𝗸𝗟𝗠𝗷🥜さようなら🔼'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "    punct = \"/-'?!.,#$%\\'()*+-/:;<=>@[\\\\]^_`{|}~`\" + '\"\"“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\\×™√²—–&'\n",
    "    small_caps_mapping = { \n",
    "    \"ᴀ\": \"a\", \"ʙ\": \"b\", \"ᴄ\": \"c\", \"ᴅ\": \"d\", \"ᴇ\": \"e\", \"ғ\": \"f\", \"ɢ\": \"g\", \"ʜ\": \"h\", \"ɪ\": \"i\", \n",
    "    \"ᴊ\": \"j\", \"ᴋ\": \"k\", \"ʟ\": \"l\", \"ᴍ\": \"m\", \"ɴ\": \"n\", \"ᴏ\": \"o\", \"ᴘ\": \"p\", \"ǫ\": \"q\", \"ʀ\": \"r\", \n",
    "    \"s\": \"s\", \"ᴛ\": \"t\", \"ᴜ\": \"u\", \"ᴠ\": \"v\", \"ᴡ\": \"w\", \"x\": \"x\", \"ʏ\": \"y\", \"ᴢ\": \"z\"}\n",
    "    contraction_mapping = {\n",
    "    \"ain't\": \"is not\", \"aren't\": \"are not\",\"can't\": \"cannot\", \"'cause\": \"because\", \"could've\": \"could have\", \"couldn't\": \"could not\", \n",
    "    \"didn't\": \"did not\",  \"doesn't\": \"does not\", \"don't\": \"do not\", \"hadn't\": \"had not\", \"hasn't\": \"has not\", \"haven't\": \"have not\", \n",
    "    \"he'd\": \"he would\",\"he'll\": \"he will\", \"he's\": \"he is\", \"how'd\": \"how did\", \"how'd'y\": \"how do you\", \"how'll\": \"how will\", \"how's\": \"how is\",  \n",
    "    \"I'd\": \"I would\", \"I'd've\": \"I would have\", \"I'll\": \"I will\", \"I'll've\": \"I will have\",\"I'm\": \"I am\", \"I've\": \"I have\", \"i'd\": \"i would\", \"i'd've\": \n",
    "    \"i would have\", \"i'll\": \"i will\",  \"i'll've\": \"i will have\",\"i'm\": \"i am\", \"i've\": \"i have\", \"isn't\": \"is not\", \"it'd\": \"it would\", \n",
    "    \"it'd've\": \"it would have\", \"it'll\": \"it will\", \"it'll've\": \"it will have\",\"it's\": \"it is\", \"let's\": \"let us\", \"ma'am\": \"madam\", \n",
    "    \"mayn't\": \"may not\", \"might've\": \"might have\",\"mightn't\": \"might not\",\"mightn't've\": \"might not have\", \"must've\": \"must have\", \n",
    "    \"mustn't\": \"must not\", \"mustn't've\": \"must not have\", \"needn't\": \"need not\", \"needn't've\": \"need not have\",\n",
    "    \"o'clock\": \"of the clock\", \"oughtn't\": \"ought not\", \"oughtn't've\": \"ought not have\", \"shan't\": \"shall not\", \n",
    "    \"sha'n't\": \"shall not\", \"shan't've\": \"shall not have\", \"she'd\": \"she would\", \"she'd've\": \"she would have\", \n",
    "    \"she'll\": \"she will\", \"she'll've\": \"she will have\", \"she's\": \"she is\", \"should've\": \"should have\", \"shouldn't\": \"should not\", \n",
    "    \"shouldn't've\": \"should not have\", \"so've\": \"so have\",\"so's\": \"so as\", \"this's\":\"this is\",\"that'd\": \"that would\", \n",
    "    \"that'd've\": \"that would have\", \"that's\": \"that is\", \"there'd\": \"there would\", \"there'd've\": \"there would have\", \"there's\": \"there is\", \n",
    "    \"here's\": \"here is\",\"they'd\": \"they would\", \"they'd've\": \"they would have\", \"they'll\": \"they will\", \"they'll've\": \"they will have\", \n",
    "    \"they're\": \"they are\", \"they've\": \"they have\", \"to've\": \"to have\", \"wasn't\": \"was not\", \"we'd\": \"we would\", \"we'd've\": \"we would have\", \n",
    "    \"we'll\": \"we will\", \"we'll've\": \"we will have\", \"we're\": \"we are\", \"we've\": \"we have\", \"weren't\": \"were not\", \"what'll\": \"what will\", \n",
    "    \"what'll've\": \"what will have\", \"what're\": \"what are\",  \"what's\": \"what is\", \"what've\": \"what have\", \"when's\": \"when is\", \n",
    "    \"when've\": \"when have\", \"where'd\": \"where did\", \"where's\": \"where is\", \"where've\": \"where have\", \"who'll\": \"who will\", \"who'll've\": \"who will have\",\n",
    "    \"who's\": \"who is\", \"who've\": \"who have\", \"why's\": \"why is\", \"why've\": \"why have\", \"will've\": \"will have\", \"won't\": \"will not\", \n",
    "    \"won't've\": \"will not have\", \"would've\": \"would have\", \"wouldn't\": \"would not\", \"wouldn't've\": \"would not have\", \n",
    "    \"y'all\": \"you all\", \"y'all'd\": \"you all would\",\"y'all'd've\": \"you all would have\",\"y'all're\": \"you all are\",\"y'all've\": \"you all have\",\n",
    "    \"you'd\": \"you would\", \"you'd've\": \"you would have\", \"you'll\": \"you will\", \"you'll've\": \"you will have\", \"you're\": \"you are\", \"you've\": \"you have\",\n",
    "    \"trump's\": \"trump is\", \"obama's\": \"obama is\", \"canada's\": \"canada is\", \"today's\": \"today is\"}\n",
    "    specail_signs = { \"…\": \"...\", \"₂\": \"2\"}\n",
    "    specials = [\"’\", \"‘\", \"´\", \"`\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk.tokenize.treebank import TreebankWordTokenizer\n",
    "tokenizer = TreebankWordTokenizer()\n",
    "\n",
    "\n",
    "isolate_dict = {ord(c):f' {c} ' for c in symbols_to_isolate}\n",
    "remove_dict = {ord(c):f'' for c in symbols_to_delete}\n",
    "\n",
    "\n",
    "def handle_punctuation(x):\n",
    "    x = x.translate(remove_dict)\n",
    "    x = x.translate(isolate_dict)\n",
    "    return x\n",
    "\n",
    "def handle_contractions(x):\n",
    "    x = tokenizer.tokenize(x)\n",
    "    return x\n",
    "\n",
    "def fix_quote(x):\n",
    "    x = [x_[1:] if x_.startswith(\"'\") else x_ for x_ in x]\n",
    "    x = ' '.join(x)\n",
    "    return x\n",
    "\n",
    "def preprocess(x):\n",
    "    x = handle_punctuation(x)\n",
    "    x = handle_contractions(x)\n",
    "    x = fix_quote(x)\n",
    "    return x"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "x_test = test['comment_text'].apply(lambda x:preprocess(x))\n",
    "max_features = 400000"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer = text.Tokenizer(num_words = max_features, filters='',lower=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "n unknown words (crawl):  14055\n",
      "n unknown words (glove):  14375\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer.fit_on_texts(list(x_test))\n",
    "\n",
    "crawl_matrix, unknown_words_crawl = build_matrix(tokenizer.word_index, CRAWL_EMBEDDING_PATH)\n",
    "print('n unknown words (crawl): ', len(unknown_words_crawl))\n",
    "\n",
    "glove_matrix, unknown_words_glove = build_matrix(tokenizer.word_index, GLOVE_EMBEDDING_PATH)\n",
    "print('n unknown words (glove): ', len(unknown_words_glove))\n",
    "\n",
    "max_features = max_features or len(tokenizer.word_index) + 1\n",
    "max_features\n",
    "\n",
    "embedding_matrix = np.concatenate([crawl_matrix, glove_matrix], axis=-1)\n",
    "embedding_matrix.shape\n",
    "\n",
    "del crawl_matrix\n",
    "del glove_matrix\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "class NeuralNet(nn.Module):\n",
    "    def __init__(self, embedding_matrix, num_aux_targets):\n",
    "        super(NeuralNet, self).__init__()\n",
    "        embed_size = embedding_matrix.shape[1]\n",
    "        \n",
    "        self.embedding = nn.Embedding(max_features, embed_size)\n",
    "        self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))\n",
    "        self.embedding.weight.requires_grad = False\n",
    "        self.embedding_dropout = SpatialDropout(0.3)\n",
    "        \n",
    "        self.lstm1 = nn.LSTM(embed_size, LSTM_UNITS, bidirectional=True, batch_first=True)\n",
    "        self.lstm2 = nn.LSTM(LSTM_UNITS * 2, LSTM_UNITS, bidirectional=True, batch_first=True)\n",
    "    \n",
    "        self.linear1 = nn.Linear(DENSE_HIDDEN_UNITS, DENSE_HIDDEN_UNITS)\n",
    "        self.linear2 = nn.Linear(DENSE_HIDDEN_UNITS, DENSE_HIDDEN_UNITS)\n",
    "        \n",
    "        self.linear_out = nn.Linear(DENSE_HIDDEN_UNITS, 1)\n",
    "        self.linear_aux_out = nn.Linear(DENSE_HIDDEN_UNITS, num_aux_targets)\n",
    "        \n",
    "    def forward(self, x, lengths=None):\n",
    "        h_embedding = self.embedding(x.long())\n",
    "        h_embedding = self.embedding_dropout(h_embedding)\n",
    "        \n",
    "        h_lstm1, _ = self.lstm1(h_embedding)\n",
    "        h_lstm2, _ = self.lstm2(h_lstm1)\n",
    "        \n",
    "        # global average pooling\n",
    "        avg_pool = torch.mean(h_lstm2, 1)\n",
    "        # global max pooling\n",
    "        max_pool, _ = torch.max(h_lstm2, 1)\n",
    "        \n",
    "        h_conc = torch.cat((max_pool, avg_pool), 1)\n",
    "        h_conc_linear1  = F.relu(self.linear1(h_conc))\n",
    "        h_conc_linear2  = F.relu(self.linear2(h_conc))\n",
    "        \n",
    "        hidden = h_conc + h_conc_linear1 + h_conc_linear2\n",
    "        \n",
    "        result = self.linear_out(hidden)\n",
    "        aux_result = self.linear_aux_out(hidden)\n",
    "        out = torch.cat([result, aux_result], 1)\n",
    "        \n",
    "        return out"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "maxlen = 230"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "batch_size = 512\n",
    "tokenized_test = tokenizer.texts_to_sequences(x_test)\n",
    "#test_lengths = torch.from_numpy(np.array([len(x) for x in tokenized_test]))\n",
    "x_test_padded = torch.from_numpy(sequence.pad_sequences(tokenized_test, maxlen=maxlen))\n",
    "test_dataset = data.TensorDataset(x_test_padded)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "def make_prediction(test,model_path,batch_size=256):\n",
    "    model = NeuralNet(embedding_matrix, 6)\n",
    "    temp_dict = torch.load(model_path)\n",
    "    temp_dict['embedding.weight'] = torch.tensor(embedding_matrix)\n",
    "    model.load_state_dict(temp_dict)\n",
    "    model.cuda()\n",
    "    for param in model.parameters():\n",
    "        param.requires_grad = False\n",
    "    model.eval()\n",
    "    \n",
    "    test_loader = torch.utils.data.DataLoader(test, batch_size=batch_size, shuffle=False)\n",
    "    test_preds = np.zeros(len(test))    \n",
    "    step = 0\n",
    "    total_step = len(test_loader)\n",
    "    for i, (x_batch,) in enumerate(test_loader):\n",
    "        if step %100 == 0:\n",
    "            print('step:',step,'total_step:',total_step)\n",
    "        step += 1\n",
    "        pred = model(x_batch.to(device))\n",
    "        test_preds[i * batch_size:(i + 1) * batch_size] = pred[:, 0].detach().cpu().squeeze().numpy()\n",
    "\n",
    "    test_pred = torch.sigmoid(torch.tensor(test_preds)).numpy().ravel()\n",
    "    return test_pred"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "model_path_epoch4 = ['../input/4lstm-model/model1_lstm_epoch4.bin',\n",
    "                    '../input/4lstm-model/model2_lstm_epoch4.bin',\n",
    "                    '../input/4lstm-model/model3_lstm_epoch4.bin',\n",
    "                    '../input/4lstm-model/model4_lstm_epoch4.bin']\n",
    "model_path_epoch3 = ['../input/4lstm-model/model1_lstm_epoch3.bin',\n",
    "                    '../input/4lstm-model/model2_lstm_epoch3.bin',\n",
    "                    '../input/4lstm-model/model3_lstm_epoch3.bin',\n",
    "                    '../input/4lstm-model/model4_lstm_epoch3.bin']\n",
    "model_path_epoch2 = ['../input/4lstm-model/model1_lstm_epoch2.bin',\n",
    "                    '../input/4lstm-model/model2_lstm_epoch2.bin',\n",
    "                    '../input/4lstm-model/model3_lstm_epoch2.bin',\n",
    "                    '../input/4lstm-model/model4_lstm_epoch2.bin']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "step: 0 total_step: 191\n",
      "step: 100 total_step: 191\n",
      "step: 0 total_step: 191\n",
      "step: 100 total_step: 191\n",
      "step: 0 total_step: 191\n",
      "step: 100 total_step: 191\n",
      "step: 0 total_step: 191\n",
      "step: 100 total_step: 191\n"
     ]
    }
   ],
   "source": [
    "epoch4_pred = []\n",
    "for model_path in model_path_epoch4:\n",
    "    temp_test_pred = make_prediction(test_dataset, model_path, batch_size=512)\n",
    "    epoch4_pred.append(temp_test_pred)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "step: 0 total_step: 191\n",
      "step: 100 total_step: 191\n",
      "step: 0 total_step: 191\n",
      "step: 100 total_step: 191\n",
      "step: 0 total_step: 191\n",
      "step: 100 total_step: 191\n",
      "step: 0 total_step: 191\n",
      "step: 100 total_step: 191\n"
     ]
    }
   ],
   "source": [
    "epoch3_pred = []\n",
    "for model_path in model_path_epoch3:\n",
    "    temp_test_pred = make_prediction(test_dataset, model_path, batch_size=512)\n",
    "    epoch3_pred.append(temp_test_pred)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "step: 0 total_step: 191\n",
      "step: 100 total_step: 191\n",
      "step: 0 total_step: 191\n",
      "step: 100 total_step: 191\n",
      "step: 0 total_step: 191\n",
      "step: 100 total_step: 191\n",
      "step: 0 total_step: 191\n",
      "step: 100 total_step: 191\n"
     ]
    }
   ],
   "source": [
    "epoch2_pred = []\n",
    "for model_path in model_path_epoch2:\n",
    "    temp_test_pred = make_prediction(test_dataset, model_path, batch_size=512)\n",
    "    epoch2_pred.append(temp_test_pred)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "epoch4_average = np.zeros(len(x_test))\n",
    "for pred in epoch4_pred:\n",
    "    epoch4_average += pred/4\n",
    "    \n",
    "epoch3_average = np.zeros(len(x_test))\n",
    "for pred in epoch3_pred:\n",
    "    epoch3_average += pred/4\n",
    "    \n",
    "epoch2_average = np.zeros(len(x_test))\n",
    "for pred in epoch2_pred:\n",
    "    epoch2_average += pred/4"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "checkpoint_weights = [6,8,4]\n",
    "lstm_pred = np.average([epoch4_average,epoch3_average,epoch2_average], weights=checkpoint_weights, axis=0)  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "submission_lstm = pd.DataFrame.from_dict({\n",
    "    'id': test['id'],\n",
    "    'prediction': lstm_pred\n",
    "})\n",
    "\n",
    "#submission_lstm.to_csv('submission.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "del embedding_matrix\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# GTP2 Prediction "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Processing /kaggle/input/finalpytorchgpt/our-gpt2-pytorch/pytorch-pretrained-BERT-master\r\n",
      "Requirement already satisfied: torch>=0.4.1 in /opt/conda/lib/python3.6/site-packages (from pytorch-pretrained-bert==0.6.2) (1.1.0)\r\n",
      "Requirement already satisfied: numpy in /opt/conda/lib/python3.6/site-packages (from pytorch-pretrained-bert==0.6.2) (1.16.4)\r\n",
      "Requirement already satisfied: boto3 in /opt/conda/lib/python3.6/site-packages (from pytorch-pretrained-bert==0.6.2) (1.9.173)\r\n",
      "Requirement already satisfied: requests in /opt/conda/lib/python3.6/site-packages (from pytorch-pretrained-bert==0.6.2) (2.22.0)\r\n",
      "Requirement already satisfied: tqdm in /opt/conda/lib/python3.6/site-packages (from pytorch-pretrained-bert==0.6.2) (4.32.1)\r\n",
      "Requirement already satisfied: regex in /opt/conda/lib/python3.6/site-packages (from pytorch-pretrained-bert==0.6.2) (2019.6.8)\r\n",
      "Requirement already satisfied: s3transfer<0.3.0,>=0.2.0 in /opt/conda/lib/python3.6/site-packages (from boto3->pytorch-pretrained-bert==0.6.2) (0.2.1)\r\n",
      "Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /opt/conda/lib/python3.6/site-packages (from boto3->pytorch-pretrained-bert==0.6.2) (0.9.4)\r\n",
      "Requirement already satisfied: botocore<1.13.0,>=1.12.173 in /opt/conda/lib/python3.6/site-packages (from boto3->pytorch-pretrained-bert==0.6.2) (1.12.173)\r\n",
      "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.6/site-packages (from requests->pytorch-pretrained-bert==0.6.2) (2019.6.16)\r\n",
      "Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /opt/conda/lib/python3.6/site-packages (from requests->pytorch-pretrained-bert==0.6.2) (3.0.4)\r\n",
      "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /opt/conda/lib/python3.6/site-packages (from requests->pytorch-pretrained-bert==0.6.2) (1.24.2)\r\n",
      "Requirement already satisfied: idna<2.9,>=2.5 in /opt/conda/lib/python3.6/site-packages (from requests->pytorch-pretrained-bert==0.6.2) (2.8)\r\n",
      "Requirement already satisfied: docutils>=0.10 in /opt/conda/lib/python3.6/site-packages (from botocore<1.13.0,>=1.12.173->boto3->pytorch-pretrained-bert==0.6.2) (0.14)\r\n",
      "Requirement already satisfied: python-dateutil<3.0.0,>=2.1; python_version >= \"2.7\" in /opt/conda/lib/python3.6/site-packages (from botocore<1.13.0,>=1.12.173->boto3->pytorch-pretrained-bert==0.6.2) (2.8.0)\r\n",
      "Requirement already satisfied: six>=1.5 in /opt/conda/lib/python3.6/site-packages (from python-dateutil<3.0.0,>=2.1; python_version >= \"2.7\"->botocore<1.13.0,>=1.12.173->boto3->pytorch-pretrained-bert==0.6.2) (1.12.0)\r\n",
      "Building wheels for collected packages: pytorch-pretrained-bert\r\n",
      "  Building wheel for pytorch-pretrained-bert (setup.py) ... \u001b[?25l-\b \b\\\b \bdone\r\n",
      "\u001b[?25h  Stored in directory: /tmp/.cache/pip/wheels/62/41/7f/a49d8fa715ffad54d21ed24028d5568b616321f3355a305dd1\r\n",
      "Successfully built pytorch-pretrained-bert\r\n",
      "\u001b[31mERROR: allennlp 0.8.4 requires awscli>=1.11.91, which is not installed.\u001b[0m\r\n",
      "\u001b[31mERROR: allennlp 0.8.4 requires flaky, which is not installed.\u001b[0m\r\n",
      "\u001b[31mERROR: allennlp 0.8.4 requires responses>=0.7, which is not installed.\u001b[0m\r\n",
      "Installing collected packages: pytorch-pretrained-bert\r\n",
      "  Found existing installation: pytorch-pretrained-bert 0.6.2\r\n",
      "    Uninstalling pytorch-pretrained-bert-0.6.2:\r\n",
      "      Successfully uninstalled pytorch-pretrained-bert-0.6.2\r\n",
      "Successfully installed pytorch-pretrained-bert-0.6.2\r\n"
     ]
    }
   ],
   "source": [
    "# the kernel environment don't have the last version of pytorch bert, so we need to install by ourselves.\n",
    "!pip install ../input/finalpytorchgpt/our-gpt2-pytorch/pytorch-pretrained-BERT-master/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "from __future__ import absolute_import\n",
    "from __future__ import division\n",
    "from __future__ import print_function\n",
    "import torch.utils.data\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from tqdm import tqdm,tqdm_notebook\n",
    "import torch.nn.functional as F\n",
    "import os\n",
    "import warnings\n",
    "# from pytorch_pretrained_bert import BertTokenizer, BertForSequenceClassification, BertAdam\n",
    "\n",
    "from pytorch_pretrained_bert import GPT2Tokenizer, GPT2Config\n",
    "from pytorch_pretrained_bert.modeling_gpt2 import *\n",
    "warnings.filterwarnings(action='once')\n",
    "device = torch.device('cuda')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Testset Collator for triming long sequences --- put it in Pytorch DataLoader\n",
    "class TestCollator():\n",
    "    def __init__(self, sequence_index,length_index,choose_length = lambda x: x.max()):\n",
    "        self.choose_length = choose_length\n",
    "        self.sequence_index = sequence_index\n",
    "        self.length_index = length_index\n",
    "\n",
    "    def __call__(self, batch):\n",
    "        batch = [torch.stack(x) for x in zip(*batch)]\n",
    "        \n",
    "        sequences = batch[self.sequence_index]\n",
    "        lengths = batch[self.length_index]\n",
    "        length = self.choose_length(lengths)\n",
    "        padded_sequences = sequences[:, 0:length]     \n",
    "        batch[self.sequence_index] = padded_sequences\n",
    "        \n",
    "        return batch[self.sequence_index]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## GTP2 Tokenization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "def convert_lines(example):\n",
    "    '''\n",
    "    example shoulbe a data \n",
    "    '''\n",
    "    all_tokens = []\n",
    "    all_length = []\n",
    "    longer = 0\n",
    "    for text in tqdm_notebook(example):\n",
    "        # print(text)\n",
    "        tokens_a = tokenizer.tokenize(text)\n",
    "        if len(tokens_a)>MAX_SEQUENCE_LENGTH:\n",
    "            tokens_a = tokens_a[:MAX_SEQUENCE_LENGTH]\n",
    "            longer += 1\n",
    "        one_token = tokenizer.convert_tokens_to_ids(tokens_a)\n",
    "        one_token_length = len(one_token)\n",
    "        one_token += [0]*(MAX_SEQUENCE_LENGTH - len(tokens_a))\n",
    "        all_tokens.append(one_token)\n",
    "        all_length.append(one_token_length)\n",
    "    print(longer)\n",
    "    return np.array(all_tokens),np.array(all_length) # Note that including Sequence Length Here"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/conda/lib/python3.6/site-packages/pytorch_pretrained_bert/tokenization_gpt2.py:146: ResourceWarning: unclosed file <_io.TextIOWrapper name='../input/gpt2-models/vocab.json' mode='r' encoding='UTF-8'>\n",
      "  self.encoder = json.load(open(vocab_file))\n",
      "/opt/conda/lib/python3.6/site-packages/pytorch_pretrained_bert/tokenization_gpt2.py:151: ResourceWarning: unclosed file <_io.TextIOWrapper name='../input/gpt2-models/merges.txt' mode='r' encoding='utf-8'>\n",
      "  bpe_data = open(merges_file, encoding='utf-8').read().split('\\n')[1:-1]\n"
     ]
    }
   ],
   "source": [
    "MAX_SEQUENCE_LENGTH = 300\n",
    "SEED = 1234\n",
    "BATCH_SIZE = 128\n",
    "# BERT_MODEL_PATH = '../input/bert-pretrained-models/uncased_l-12_h-768_a-12/uncased_L-12_H-768_A-12/'\n",
    "\n",
    "np.random.seed(SEED)\n",
    "torch.manual_seed(SEED)\n",
    "torch.cuda.manual_seed(SEED)\n",
    "torch.backends.cudnn.deterministic = True\n",
    "tokenizer = GPT2Tokenizer.from_pretrained('../input/gpt2-models/')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "19bef29abec24c63879d4ea1caeab8d5",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(IntProgress(value=0, max=97320), HTML(value='')))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "32\n"
     ]
    }
   ],
   "source": [
    "test_df = pd.read_csv(\"../input/jigsaw-unintended-bias-in-toxicity-classification/test.csv\")\n",
    "test_df['comment_text'] = test_df['comment_text'].astype(str) \n",
    "X_test,seq_length = convert_lines(test_df[\"comment_text\"].fillna(\"DUMMY_VALUE\"))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## GPT2 0.9388 Prediction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "config = GPT2Config()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "class GPT2ClassificationHeadModel(GPT2PreTrainedModel):\n",
    "\n",
    "    def __init__(self, config, clf_dropout=0.4, n_class=7):\n",
    "        super(GPT2ClassificationHeadModel, self).__init__(config)\n",
    "        self.transformer = GPT2Model(config)\n",
    "        self.dropout = nn.Dropout(clf_dropout)\n",
    "        self.linear1 = nn.Linear(config.n_embd * 2, config.n_embd)\n",
    "        self.linear = nn.Linear(config.n_embd, n_class)\n",
    "        self.linear2 = nn.Linear(config.n_embd, config.n_embd)\n",
    "        self.out = nn.Linear(config.n_embd, 1)\n",
    "        # self.bn1 = nn.BatchNorm1d(config.n_embd, affine = False)\n",
    "        nn.init.normal_(self.linear.weight, std = 0.02)\n",
    "        nn.init.normal_(self.linear.bias, 0)\n",
    "        \n",
    "        self.apply(self.init_weights)\n",
    "\n",
    "\n",
    "    def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, past=None):\n",
    "        hidden_states, presents = self.transformer(input_ids, position_ids, token_type_ids, past)\n",
    "        avg_pool = torch.mean(hidden_states, 1)\n",
    "        max_pool, _ = torch.max(hidden_states, 1)\n",
    "        \n",
    "        h_conc = torch.cat((avg_pool, max_pool), 1)\n",
    "        \n",
    "        hidden = self.dropout(F.relu(self.linear1(h_conc)))\n",
    "        hidden1 = self.dropout(F.relu(self.linear2(hidden)))\n",
    "        \n",
    "        h = hidden + hidden1\n",
    "        \n",
    "        \n",
    "        output = self.out(self.dropout(h))\n",
    "        aux = self.linear(self.dropout(h))\n",
    "        \n",
    "        logits = torch.cat([output, aux], 1)\n",
    "        \n",
    "        return logits"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = GPT2ClassificationHeadModel(config, 0.3,7)\n",
    "model = torch.nn.DataParallel(model)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['GPT2_full_0622.bin']"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "os.listdir(\"../input/gpt-dense-0622\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DataParallel(\n",
       "  (module): GPT2ClassificationHeadModel(\n",
       "    (transformer): GPT2Model(\n",
       "      (wte): Embedding(50257, 768)\n",
       "      (wpe): Embedding(1024, 768)\n",
       "      (h): ModuleList(\n",
       "        (0): Block(\n",
       "          (ln_1): BertLayerNorm()\n",
       "          (attn): Attention(\n",
       "            (c_attn): Conv1D()\n",
       "            (c_proj): Conv1D()\n",
       "          )\n",
       "          (ln_2): BertLayerNorm()\n",
       "          (mlp): MLP(\n",
       "            (c_fc): Conv1D()\n",
       "            (c_proj): Conv1D()\n",
       "          )\n",
       "        )\n",
       "        (1): Block(\n",
       "          (ln_1): BertLayerNorm()\n",
       "          (attn): Attention(\n",
       "            (c_attn): Conv1D()\n",
       "            (c_proj): Conv1D()\n",
       "          )\n",
       "          (ln_2): BertLayerNorm()\n",
       "          (mlp): MLP(\n",
       "            (c_fc): Conv1D()\n",
       "            (c_proj): Conv1D()\n",
       "          )\n",
       "        )\n",
       "        (2): Block(\n",
       "          (ln_1): BertLayerNorm()\n",
       "          (attn): Attention(\n",
       "            (c_attn): Conv1D()\n",
       "            (c_proj): Conv1D()\n",
       "          )\n",
       "          (ln_2): BertLayerNorm()\n",
       "          (mlp): MLP(\n",
       "            (c_fc): Conv1D()\n",
       "            (c_proj): Conv1D()\n",
       "          )\n",
       "        )\n",
       "        (3): Block(\n",
       "          (ln_1): BertLayerNorm()\n",
       "          (attn): Attention(\n",
       "            (c_attn): Conv1D()\n",
       "            (c_proj): Conv1D()\n",
       "          )\n",
       "          (ln_2): BertLayerNorm()\n",
       "          (mlp): MLP(\n",
       "            (c_fc): Conv1D()\n",
       "            (c_proj): Conv1D()\n",
       "          )\n",
       "        )\n",
       "        (4): Block(\n",
       "          (ln_1): BertLayerNorm()\n",
       "          (attn): Attention(\n",
       "            (c_attn): Conv1D()\n",
       "            (c_proj): Conv1D()\n",
       "          )\n",
       "          (ln_2): BertLayerNorm()\n",
       "          (mlp): MLP(\n",
       "            (c_fc): Conv1D()\n",
       "            (c_proj): Conv1D()\n",
       "          )\n",
       "        )\n",
       "        (5): Block(\n",
       "          (ln_1): BertLayerNorm()\n",
       "          (attn): Attention(\n",
       "            (c_attn): Conv1D()\n",
       "            (c_proj): Conv1D()\n",
       "          )\n",
       "          (ln_2): BertLayerNorm()\n",
       "          (mlp): MLP(\n",
       "            (c_fc): Conv1D()\n",
       "            (c_proj): Conv1D()\n",
       "          )\n",
       "        )\n",
       "        (6): Block(\n",
       "          (ln_1): BertLayerNorm()\n",
       "          (attn): Attention(\n",
       "            (c_attn): Conv1D()\n",
       "            (c_proj): Conv1D()\n",
       "          )\n",
       "          (ln_2): BertLayerNorm()\n",
       "          (mlp): MLP(\n",
       "            (c_fc): Conv1D()\n",
       "            (c_proj): Conv1D()\n",
       "          )\n",
       "        )\n",
       "        (7): Block(\n",
       "          (ln_1): BertLayerNorm()\n",
       "          (attn): Attention(\n",
       "            (c_attn): Conv1D()\n",
       "            (c_proj): Conv1D()\n",
       "          )\n",
       "          (ln_2): BertLayerNorm()\n",
       "          (mlp): MLP(\n",
       "            (c_fc): Conv1D()\n",
       "            (c_proj): Conv1D()\n",
       "          )\n",
       "        )\n",
       "        (8): Block(\n",
       "          (ln_1): BertLayerNorm()\n",
       "          (attn): Attention(\n",
       "            (c_attn): Conv1D()\n",
       "            (c_proj): Conv1D()\n",
       "          )\n",
       "          (ln_2): BertLayerNorm()\n",
       "          (mlp): MLP(\n",
       "            (c_fc): Conv1D()\n",
       "            (c_proj): Conv1D()\n",
       "          )\n",
       "        )\n",
       "        (9): Block(\n",
       "          (ln_1): BertLayerNorm()\n",
       "          (attn): Attention(\n",
       "            (c_attn): Conv1D()\n",
       "            (c_proj): Conv1D()\n",
       "          )\n",
       "          (ln_2): BertLayerNorm()\n",
       "          (mlp): MLP(\n",
       "            (c_fc): Conv1D()\n",
       "            (c_proj): Conv1D()\n",
       "          )\n",
       "        )\n",
       "        (10): Block(\n",
       "          (ln_1): BertLayerNorm()\n",
       "          (attn): Attention(\n",
       "            (c_attn): Conv1D()\n",
       "            (c_proj): Conv1D()\n",
       "          )\n",
       "          (ln_2): BertLayerNorm()\n",
       "          (mlp): MLP(\n",
       "            (c_fc): Conv1D()\n",
       "            (c_proj): Conv1D()\n",
       "          )\n",
       "        )\n",
       "        (11): Block(\n",
       "          (ln_1): BertLayerNorm()\n",
       "          (attn): Attention(\n",
       "            (c_attn): Conv1D()\n",
       "            (c_proj): Conv1D()\n",
       "          )\n",
       "          (ln_2): BertLayerNorm()\n",
       "          (mlp): MLP(\n",
       "            (c_fc): Conv1D()\n",
       "            (c_proj): Conv1D()\n",
       "          )\n",
       "        )\n",
       "      )\n",
       "      (ln_f): BertLayerNorm()\n",
       "    )\n",
       "    (dropout): Dropout(p=0.3)\n",
       "    (linear1): Linear(in_features=1536, out_features=768, bias=True)\n",
       "    (linear): Linear(in_features=768, out_features=7, bias=True)\n",
       "    (linear2): Linear(in_features=768, out_features=768, bias=True)\n",
       "    (out): Linear(in_features=768, out_features=1, bias=True)\n",
       "  )\n",
       ")"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.load_state_dict(torch.load(\"../input/gpt-dense-0622/GPT2_full_0622.bin\"))\n",
    "\n",
    "model.to(device)\n",
    "for param in model.parameters():\n",
    "    param.requires_grad = False\n",
    "model.eval()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_collator = TestCollator(0,1)\n",
    "seq_length_torch = torch.tensor(seq_length,dtype=torch.long)\n",
    "X_test_torch =torch.tensor(X_test, dtype=torch.long)\n",
    "test = torch.utils.data.TensorDataset(X_test_torch,seq_length_torch)\n",
    "test_loader = torch.utils.data.DataLoader(test, batch_size=32, shuffle=False,collate_fn=test_collator)\n",
    "test_preds = np.zeros((len(X_test)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "9484e413bb644ce0955537b6194826e1",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(IntProgress(value=0, max=3042), HTML(value='')))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "tk0 = tqdm_notebook(test_loader)\n",
    "for i, x_batch in enumerate(tk0):\n",
    "    pred = model(x_batch.to(device))\n",
    "    test_preds[i * 32:(i + 1) * 32] = pred[:, 0].detach().cpu().squeeze().numpy()\n",
    "\n",
    "test_pred = torch.sigmoid(torch.tensor(test_preds)).numpy().ravel()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "submission_gpt2_1 = pd.DataFrame.from_dict({\n",
    "    'id': test_df['id'],\n",
    "    'prediction': test_pred\n",
    "})\n",
    "#submission_gpt2_1.to_csv('submission.csv', index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Bert Large"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {
    "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
    "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5"
   },
   "outputs": [],
   "source": [
    "from __future__ import absolute_import\n",
    "from __future__ import division\n",
    "from __future__ import print_function\n",
    "import torch.utils.data\n",
    "from torch import nn\n",
    "from torch.nn import functional as F\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from tqdm import tqdm,tqdm_notebook\n",
    "import os\n",
    "import warnings\n",
    "from pytorch_pretrained_bert import BertTokenizer, BertForSequenceClassification, BertAdam\n",
    "from pytorch_pretrained_bert import BertConfig\n",
    "\n",
    "warnings.filterwarnings(action='once')\n",
    "device = torch.device('cuda')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Add the Bart Pytorch repo to the PATH\n",
    "# using files from: https://github.com/huggingface/pytorch-pretrained-BERT\n",
    "# package_dir_a = \"../input/ppbert/pytorch-pretrained-bert/pytorch-pretrained-BERT\"\n",
    "# sys.path.insert(0, package_dir_a)\n",
    "\n",
    "from pytorch_pretrained_bert import convert_tf_checkpoint_to_pytorch\n",
    "from pytorch_pretrained_bert import BertTokenizer, BertForSequenceClassification,BertAdam"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Converting the lines to BERT format\n",
    "# Thanks to https://www.kaggle.com/httpwwwfszyc/bert-in-keras-taming\n",
    "def convert_lines(example, max_seq_length,tokenizer):\n",
    "    max_seq_length -=2\n",
    "    all_tokens = []\n",
    "    all_length = []\n",
    "    longer = 0\n",
    "    for text in tqdm_notebook(example):\n",
    "        tokens_a = tokenizer.tokenize(text)\n",
    "        if len(tokens_a)>max_seq_length:\n",
    "            tokens_a = tokens_a[:max_seq_length]\n",
    "            longer += 1\n",
    "        one_token = tokenizer.convert_tokens_to_ids([\"[CLS]\"]+tokens_a+[\"[SEP]\"])\n",
    "        one_token_length = len(one_token)\n",
    "        one_token = one_token + [0] * (max_seq_length - len(tokens_a))\n",
    "        all_tokens.append(one_token)\n",
    "        all_length.append(one_token_length)\n",
    "    print(longer)\n",
    "    return np.array(all_tokens),np.array(all_length) # Note that including Sequence Length Here"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Convert Tensorflow Weight to Pytorch Weight"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {
    "_cell_guid": "79c7e3d0-c299-4dcb-8224-4455121ee9b0",
    "_uuid": "d629ff2d2480ee46fbb7e2d37f6b5fab8052498a"
   },
   "outputs": [],
   "source": [
    "MAX_SEQUENCE_LENGTH = 320 ### 320 is enough\n",
    "SEED = 1234 \n",
    "BATCH_SIZE = 64\n",
    "BERT_MODEL_PATH = '../input/bert-pretrained-model-uncased/uncased_l-24_h-1024_a-16/uncased_L-24_H-1024_A-16/'\n",
    "Data_dir=\"../input/jigsaw-unintended-bias-in-toxicity-classification\" \n",
    "Input_dir = \"../input\"  ###############\n",
    "WORK_DIR = \"../working/\" ############\n",
    "\n",
    "\n",
    "\n",
    "#bert_config = BertConfig('../input/prob-target333/bert_config.json')\n",
    "tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_PATH, cache_dir=None,do_lower_case=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##  Tokenization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "cd3f6d86be6b49a7ae441ca6a07446f7",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(IntProgress(value=0, max=97320), HTML(value='')))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "6\n"
     ]
    }
   ],
   "source": [
    "\n",
    "test_df = pd.read_csv(\"../input/jigsaw-unintended-bias-in-toxicity-classification/test.csv\")\n",
    "test_df['comment_text'] = test_df['comment_text'].astype(str) \n",
    "X_test,seq_length = convert_lines(test_df[\"comment_text\"].fillna(\"DUMMY_VALUE\"), MAX_SEQUENCE_LENGTH, tokenizer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Building PyTorch model from configuration: {\n",
      "  \"attention_probs_dropout_prob\": 0.1,\n",
      "  \"hidden_act\": \"gelu\",\n",
      "  \"hidden_dropout_prob\": 0.1,\n",
      "  \"hidden_size\": 1024,\n",
      "  \"initializer_range\": 0.02,\n",
      "  \"intermediate_size\": 4096,\n",
      "  \"layer_norm_eps\": 1e-12,\n",
      "  \"max_position_embeddings\": 512,\n",
      "  \"num_attention_heads\": 16,\n",
      "  \"num_hidden_layers\": 24,\n",
      "  \"type_vocab_size\": 2,\n",
      "  \"vocab_size\": 30522\n",
      "}\n",
      "\n",
      "Converting TensorFlow checkpoint from /kaggle/input/bert-pretrained-model-uncased/uncased_l-24_h-1024_a-16/uncased_L-24_H-1024_A-16/bert_model.ckpt\n",
      "Loading TF weight bert/embeddings/LayerNorm/beta with shape [1024]\n",
      "Loading TF weight bert/embeddings/LayerNorm/gamma with shape [1024]\n",
      "Loading TF weight bert/embeddings/position_embeddings with shape [512, 1024]\n",
      "Loading TF weight bert/embeddings/token_type_embeddings with shape [2, 1024]\n",
      "Loading TF weight bert/embeddings/word_embeddings with shape [30522, 1024]\n",
      "Loading TF weight bert/encoder/layer_0/attention/output/LayerNorm/beta with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_0/attention/output/LayerNorm/gamma with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_0/attention/output/dense/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_0/attention/output/dense/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_0/attention/self/key/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_0/attention/self/key/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_0/attention/self/query/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_0/attention/self/query/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_0/attention/self/value/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_0/attention/self/value/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_0/intermediate/dense/bias with shape [4096]\n",
      "Loading TF weight bert/encoder/layer_0/intermediate/dense/kernel with shape [1024, 4096]\n",
      "Loading TF weight bert/encoder/layer_0/output/LayerNorm/beta with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_0/output/LayerNorm/gamma with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_0/output/dense/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_0/output/dense/kernel with shape [4096, 1024]\n",
      "Loading TF weight bert/encoder/layer_1/attention/output/LayerNorm/beta with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_1/attention/output/LayerNorm/gamma with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_1/attention/output/dense/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_1/attention/output/dense/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_1/attention/self/key/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_1/attention/self/key/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_1/attention/self/query/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_1/attention/self/query/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_1/attention/self/value/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_1/attention/self/value/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_1/intermediate/dense/bias with shape [4096]\n",
      "Loading TF weight bert/encoder/layer_1/intermediate/dense/kernel with shape [1024, 4096]\n",
      "Loading TF weight bert/encoder/layer_1/output/LayerNorm/beta with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_1/output/LayerNorm/gamma with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_1/output/dense/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_1/output/dense/kernel with shape [4096, 1024]\n",
      "Loading TF weight bert/encoder/layer_10/attention/output/LayerNorm/beta with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_10/attention/output/LayerNorm/gamma with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_10/attention/output/dense/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_10/attention/output/dense/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_10/attention/self/key/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_10/attention/self/key/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_10/attention/self/query/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_10/attention/self/query/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_10/attention/self/value/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_10/attention/self/value/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_10/intermediate/dense/bias with shape [4096]\n",
      "Loading TF weight bert/encoder/layer_10/intermediate/dense/kernel with shape [1024, 4096]\n",
      "Loading TF weight bert/encoder/layer_10/output/LayerNorm/beta with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_10/output/LayerNorm/gamma with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_10/output/dense/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_10/output/dense/kernel with shape [4096, 1024]\n",
      "Loading TF weight bert/encoder/layer_11/attention/output/LayerNorm/beta with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_11/attention/output/LayerNorm/gamma with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_11/attention/output/dense/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_11/attention/output/dense/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_11/attention/self/key/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_11/attention/self/key/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_11/attention/self/query/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_11/attention/self/query/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_11/attention/self/value/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_11/attention/self/value/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_11/intermediate/dense/bias with shape [4096]\n",
      "Loading TF weight bert/encoder/layer_11/intermediate/dense/kernel with shape [1024, 4096]\n",
      "Loading TF weight bert/encoder/layer_11/output/LayerNorm/beta with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_11/output/LayerNorm/gamma with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_11/output/dense/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_11/output/dense/kernel with shape [4096, 1024]\n",
      "Loading TF weight bert/encoder/layer_12/attention/output/LayerNorm/beta with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_12/attention/output/LayerNorm/gamma with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_12/attention/output/dense/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_12/attention/output/dense/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_12/attention/self/key/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_12/attention/self/key/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_12/attention/self/query/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_12/attention/self/query/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_12/attention/self/value/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_12/attention/self/value/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_12/intermediate/dense/bias with shape [4096]\n",
      "Loading TF weight bert/encoder/layer_12/intermediate/dense/kernel with shape [1024, 4096]\n",
      "Loading TF weight bert/encoder/layer_12/output/LayerNorm/beta with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_12/output/LayerNorm/gamma with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_12/output/dense/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_12/output/dense/kernel with shape [4096, 1024]\n",
      "Loading TF weight bert/encoder/layer_13/attention/output/LayerNorm/beta with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_13/attention/output/LayerNorm/gamma with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_13/attention/output/dense/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_13/attention/output/dense/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_13/attention/self/key/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_13/attention/self/key/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_13/attention/self/query/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_13/attention/self/query/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_13/attention/self/value/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_13/attention/self/value/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_13/intermediate/dense/bias with shape [4096]\n",
      "Loading TF weight bert/encoder/layer_13/intermediate/dense/kernel with shape [1024, 4096]\n",
      "Loading TF weight bert/encoder/layer_13/output/LayerNorm/beta with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_13/output/LayerNorm/gamma with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_13/output/dense/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_13/output/dense/kernel with shape [4096, 1024]\n",
      "Loading TF weight bert/encoder/layer_14/attention/output/LayerNorm/beta with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_14/attention/output/LayerNorm/gamma with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_14/attention/output/dense/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_14/attention/output/dense/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_14/attention/self/key/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_14/attention/self/key/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_14/attention/self/query/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_14/attention/self/query/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_14/attention/self/value/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_14/attention/self/value/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_14/intermediate/dense/bias with shape [4096]\n",
      "Loading TF weight bert/encoder/layer_14/intermediate/dense/kernel with shape [1024, 4096]\n",
      "Loading TF weight bert/encoder/layer_14/output/LayerNorm/beta with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_14/output/LayerNorm/gamma with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_14/output/dense/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_14/output/dense/kernel with shape [4096, 1024]\n",
      "Loading TF weight bert/encoder/layer_15/attention/output/LayerNorm/beta with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_15/attention/output/LayerNorm/gamma with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_15/attention/output/dense/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_15/attention/output/dense/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_15/attention/self/key/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_15/attention/self/key/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_15/attention/self/query/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_15/attention/self/query/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_15/attention/self/value/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_15/attention/self/value/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_15/intermediate/dense/bias with shape [4096]\n",
      "Loading TF weight bert/encoder/layer_15/intermediate/dense/kernel with shape [1024, 4096]\n",
      "Loading TF weight bert/encoder/layer_15/output/LayerNorm/beta with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_15/output/LayerNorm/gamma with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_15/output/dense/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_15/output/dense/kernel with shape [4096, 1024]\n",
      "Loading TF weight bert/encoder/layer_16/attention/output/LayerNorm/beta with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_16/attention/output/LayerNorm/gamma with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_16/attention/output/dense/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_16/attention/output/dense/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_16/attention/self/key/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_16/attention/self/key/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_16/attention/self/query/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_16/attention/self/query/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_16/attention/self/value/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_16/attention/self/value/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_16/intermediate/dense/bias with shape [4096]\n",
      "Loading TF weight bert/encoder/layer_16/intermediate/dense/kernel with shape [1024, 4096]\n",
      "Loading TF weight bert/encoder/layer_16/output/LayerNorm/beta with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_16/output/LayerNorm/gamma with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_16/output/dense/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_16/output/dense/kernel with shape [4096, 1024]\n",
      "Loading TF weight bert/encoder/layer_17/attention/output/LayerNorm/beta with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_17/attention/output/LayerNorm/gamma with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_17/attention/output/dense/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_17/attention/output/dense/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_17/attention/self/key/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_17/attention/self/key/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_17/attention/self/query/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_17/attention/self/query/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_17/attention/self/value/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_17/attention/self/value/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_17/intermediate/dense/bias with shape [4096]\n",
      "Loading TF weight bert/encoder/layer_17/intermediate/dense/kernel with shape [1024, 4096]\n",
      "Loading TF weight bert/encoder/layer_17/output/LayerNorm/beta with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_17/output/LayerNorm/gamma with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_17/output/dense/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_17/output/dense/kernel with shape [4096, 1024]\n",
      "Loading TF weight bert/encoder/layer_18/attention/output/LayerNorm/beta with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_18/attention/output/LayerNorm/gamma with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_18/attention/output/dense/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_18/attention/output/dense/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_18/attention/self/key/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_18/attention/self/key/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_18/attention/self/query/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_18/attention/self/query/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_18/attention/self/value/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_18/attention/self/value/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_18/intermediate/dense/bias with shape [4096]\n",
      "Loading TF weight bert/encoder/layer_18/intermediate/dense/kernel with shape [1024, 4096]\n",
      "Loading TF weight bert/encoder/layer_18/output/LayerNorm/beta with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_18/output/LayerNorm/gamma with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_18/output/dense/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_18/output/dense/kernel with shape [4096, 1024]\n",
      "Loading TF weight bert/encoder/layer_19/attention/output/LayerNorm/beta with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_19/attention/output/LayerNorm/gamma with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_19/attention/output/dense/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_19/attention/output/dense/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_19/attention/self/key/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_19/attention/self/key/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_19/attention/self/query/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_19/attention/self/query/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_19/attention/self/value/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_19/attention/self/value/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_19/intermediate/dense/bias with shape [4096]\n",
      "Loading TF weight bert/encoder/layer_19/intermediate/dense/kernel with shape [1024, 4096]\n",
      "Loading TF weight bert/encoder/layer_19/output/LayerNorm/beta with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_19/output/LayerNorm/gamma with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_19/output/dense/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_19/output/dense/kernel with shape [4096, 1024]\n",
      "Loading TF weight bert/encoder/layer_2/attention/output/LayerNorm/beta with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_2/attention/output/LayerNorm/gamma with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_2/attention/output/dense/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_2/attention/output/dense/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_2/attention/self/key/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_2/attention/self/key/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_2/attention/self/query/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_2/attention/self/query/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_2/attention/self/value/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_2/attention/self/value/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_2/intermediate/dense/bias with shape [4096]\n",
      "Loading TF weight bert/encoder/layer_2/intermediate/dense/kernel with shape [1024, 4096]\n",
      "Loading TF weight bert/encoder/layer_2/output/LayerNorm/beta with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_2/output/LayerNorm/gamma with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_2/output/dense/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_2/output/dense/kernel with shape [4096, 1024]\n",
      "Loading TF weight bert/encoder/layer_20/attention/output/LayerNorm/beta with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_20/attention/output/LayerNorm/gamma with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_20/attention/output/dense/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_20/attention/output/dense/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_20/attention/self/key/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_20/attention/self/key/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_20/attention/self/query/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_20/attention/self/query/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_20/attention/self/value/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_20/attention/self/value/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_20/intermediate/dense/bias with shape [4096]\n",
      "Loading TF weight bert/encoder/layer_20/intermediate/dense/kernel with shape [1024, 4096]\n",
      "Loading TF weight bert/encoder/layer_20/output/LayerNorm/beta with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_20/output/LayerNorm/gamma with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_20/output/dense/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_20/output/dense/kernel with shape [4096, 1024]\n",
      "Loading TF weight bert/encoder/layer_21/attention/output/LayerNorm/beta with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_21/attention/output/LayerNorm/gamma with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_21/attention/output/dense/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_21/attention/output/dense/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_21/attention/self/key/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_21/attention/self/key/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_21/attention/self/query/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_21/attention/self/query/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_21/attention/self/value/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_21/attention/self/value/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_21/intermediate/dense/bias with shape [4096]\n",
      "Loading TF weight bert/encoder/layer_21/intermediate/dense/kernel with shape [1024, 4096]\n",
      "Loading TF weight bert/encoder/layer_21/output/LayerNorm/beta with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_21/output/LayerNorm/gamma with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_21/output/dense/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_21/output/dense/kernel with shape [4096, 1024]\n",
      "Loading TF weight bert/encoder/layer_22/attention/output/LayerNorm/beta with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_22/attention/output/LayerNorm/gamma with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_22/attention/output/dense/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_22/attention/output/dense/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_22/attention/self/key/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_22/attention/self/key/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_22/attention/self/query/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_22/attention/self/query/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_22/attention/self/value/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_22/attention/self/value/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_22/intermediate/dense/bias with shape [4096]\n",
      "Loading TF weight bert/encoder/layer_22/intermediate/dense/kernel with shape [1024, 4096]\n",
      "Loading TF weight bert/encoder/layer_22/output/LayerNorm/beta with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_22/output/LayerNorm/gamma with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_22/output/dense/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_22/output/dense/kernel with shape [4096, 1024]\n",
      "Loading TF weight bert/encoder/layer_23/attention/output/LayerNorm/beta with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_23/attention/output/LayerNorm/gamma with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_23/attention/output/dense/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_23/attention/output/dense/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_23/attention/self/key/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_23/attention/self/key/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_23/attention/self/query/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_23/attention/self/query/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_23/attention/self/value/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_23/attention/self/value/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_23/intermediate/dense/bias with shape [4096]\n",
      "Loading TF weight bert/encoder/layer_23/intermediate/dense/kernel with shape [1024, 4096]\n",
      "Loading TF weight bert/encoder/layer_23/output/LayerNorm/beta with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_23/output/LayerNorm/gamma with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_23/output/dense/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_23/output/dense/kernel with shape [4096, 1024]\n",
      "Loading TF weight bert/encoder/layer_3/attention/output/LayerNorm/beta with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_3/attention/output/LayerNorm/gamma with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_3/attention/output/dense/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_3/attention/output/dense/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_3/attention/self/key/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_3/attention/self/key/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_3/attention/self/query/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_3/attention/self/query/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_3/attention/self/value/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_3/attention/self/value/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_3/intermediate/dense/bias with shape [4096]\n",
      "Loading TF weight bert/encoder/layer_3/intermediate/dense/kernel with shape [1024, 4096]\n",
      "Loading TF weight bert/encoder/layer_3/output/LayerNorm/beta with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_3/output/LayerNorm/gamma with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_3/output/dense/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_3/output/dense/kernel with shape [4096, 1024]\n",
      "Loading TF weight bert/encoder/layer_4/attention/output/LayerNorm/beta with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_4/attention/output/LayerNorm/gamma with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_4/attention/output/dense/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_4/attention/output/dense/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_4/attention/self/key/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_4/attention/self/key/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_4/attention/self/query/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_4/attention/self/query/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_4/attention/self/value/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_4/attention/self/value/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_4/intermediate/dense/bias with shape [4096]\n",
      "Loading TF weight bert/encoder/layer_4/intermediate/dense/kernel with shape [1024, 4096]\n",
      "Loading TF weight bert/encoder/layer_4/output/LayerNorm/beta with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_4/output/LayerNorm/gamma with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_4/output/dense/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_4/output/dense/kernel with shape [4096, 1024]\n",
      "Loading TF weight bert/encoder/layer_5/attention/output/LayerNorm/beta with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_5/attention/output/LayerNorm/gamma with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_5/attention/output/dense/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_5/attention/output/dense/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_5/attention/self/key/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_5/attention/self/key/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_5/attention/self/query/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_5/attention/self/query/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_5/attention/self/value/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_5/attention/self/value/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_5/intermediate/dense/bias with shape [4096]\n",
      "Loading TF weight bert/encoder/layer_5/intermediate/dense/kernel with shape [1024, 4096]\n",
      "Loading TF weight bert/encoder/layer_5/output/LayerNorm/beta with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_5/output/LayerNorm/gamma with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_5/output/dense/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_5/output/dense/kernel with shape [4096, 1024]\n",
      "Loading TF weight bert/encoder/layer_6/attention/output/LayerNorm/beta with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_6/attention/output/LayerNorm/gamma with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_6/attention/output/dense/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_6/attention/output/dense/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_6/attention/self/key/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_6/attention/self/key/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_6/attention/self/query/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_6/attention/self/query/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_6/attention/self/value/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_6/attention/self/value/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_6/intermediate/dense/bias with shape [4096]\n",
      "Loading TF weight bert/encoder/layer_6/intermediate/dense/kernel with shape [1024, 4096]\n",
      "Loading TF weight bert/encoder/layer_6/output/LayerNorm/beta with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_6/output/LayerNorm/gamma with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_6/output/dense/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_6/output/dense/kernel with shape [4096, 1024]\n",
      "Loading TF weight bert/encoder/layer_7/attention/output/LayerNorm/beta with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_7/attention/output/LayerNorm/gamma with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_7/attention/output/dense/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_7/attention/output/dense/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_7/attention/self/key/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_7/attention/self/key/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_7/attention/self/query/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_7/attention/self/query/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_7/attention/self/value/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_7/attention/self/value/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_7/intermediate/dense/bias with shape [4096]\n",
      "Loading TF weight bert/encoder/layer_7/intermediate/dense/kernel with shape [1024, 4096]\n",
      "Loading TF weight bert/encoder/layer_7/output/LayerNorm/beta with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_7/output/LayerNorm/gamma with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_7/output/dense/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_7/output/dense/kernel with shape [4096, 1024]\n",
      "Loading TF weight bert/encoder/layer_8/attention/output/LayerNorm/beta with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_8/attention/output/LayerNorm/gamma with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_8/attention/output/dense/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_8/attention/output/dense/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_8/attention/self/key/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_8/attention/self/key/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_8/attention/self/query/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_8/attention/self/query/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_8/attention/self/value/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_8/attention/self/value/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_8/intermediate/dense/bias with shape [4096]\n",
      "Loading TF weight bert/encoder/layer_8/intermediate/dense/kernel with shape [1024, 4096]\n",
      "Loading TF weight bert/encoder/layer_8/output/LayerNorm/beta with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_8/output/LayerNorm/gamma with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_8/output/dense/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_8/output/dense/kernel with shape [4096, 1024]\n",
      "Loading TF weight bert/encoder/layer_9/attention/output/LayerNorm/beta with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_9/attention/output/LayerNorm/gamma with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_9/attention/output/dense/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_9/attention/output/dense/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_9/attention/self/key/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_9/attention/self/key/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_9/attention/self/query/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_9/attention/self/query/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_9/attention/self/value/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_9/attention/self/value/kernel with shape [1024, 1024]\n",
      "Loading TF weight bert/encoder/layer_9/intermediate/dense/bias with shape [4096]\n",
      "Loading TF weight bert/encoder/layer_9/intermediate/dense/kernel with shape [1024, 4096]\n",
      "Loading TF weight bert/encoder/layer_9/output/LayerNorm/beta with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_9/output/LayerNorm/gamma with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_9/output/dense/bias with shape [1024]\n",
      "Loading TF weight bert/encoder/layer_9/output/dense/kernel with shape [4096, 1024]\n",
      "Loading TF weight bert/pooler/dense/bias with shape [1024]\n",
      "Loading TF weight bert/pooler/dense/kernel with shape [1024, 1024]\n",
      "Loading TF weight cls/predictions/output_bias with shape [30522]\n",
      "Loading TF weight cls/predictions/transform/LayerNorm/beta with shape [1024]\n",
      "Loading TF weight cls/predictions/transform/LayerNorm/gamma with shape [1024]\n",
      "Loading TF weight cls/predictions/transform/dense/bias with shape [1024]\n",
      "Loading TF weight cls/predictions/transform/dense/kernel with shape [1024, 1024]\n",
      "Loading TF weight cls/seq_relationship/output_bias with shape [2]\n",
      "Loading TF weight cls/seq_relationship/output_weights with shape [2, 1024]\n",
      "Initialize PyTorch weight ['bert', 'embeddings', 'LayerNorm', 'beta']\n",
      "Initialize PyTorch weight ['bert', 'embeddings', 'LayerNorm', 'gamma']\n",
      "Initialize PyTorch weight ['bert', 'embeddings', 'position_embeddings']\n",
      "Initialize PyTorch weight ['bert', 'embeddings', 'token_type_embeddings']\n",
      "Initialize PyTorch weight ['bert', 'embeddings', 'word_embeddings']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_0', 'attention', 'output', 'LayerNorm', 'beta']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_0', 'attention', 'output', 'LayerNorm', 'gamma']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_0', 'attention', 'output', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_0', 'attention', 'output', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_0', 'attention', 'self', 'key', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_0', 'attention', 'self', 'key', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_0', 'attention', 'self', 'query', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_0', 'attention', 'self', 'query', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_0', 'attention', 'self', 'value', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_0', 'attention', 'self', 'value', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_0', 'intermediate', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_0', 'intermediate', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_0', 'output', 'LayerNorm', 'beta']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_0', 'output', 'LayerNorm', 'gamma']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_0', 'output', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_0', 'output', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_1', 'attention', 'output', 'LayerNorm', 'beta']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_1', 'attention', 'output', 'LayerNorm', 'gamma']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_1', 'attention', 'output', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_1', 'attention', 'output', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_1', 'attention', 'self', 'key', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_1', 'attention', 'self', 'key', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_1', 'attention', 'self', 'query', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_1', 'attention', 'self', 'query', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_1', 'attention', 'self', 'value', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_1', 'attention', 'self', 'value', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_1', 'intermediate', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_1', 'intermediate', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_1', 'output', 'LayerNorm', 'beta']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_1', 'output', 'LayerNorm', 'gamma']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_1', 'output', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_1', 'output', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_10', 'attention', 'output', 'LayerNorm', 'beta']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_10', 'attention', 'output', 'LayerNorm', 'gamma']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_10', 'attention', 'output', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_10', 'attention', 'output', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_10', 'attention', 'self', 'key', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_10', 'attention', 'self', 'key', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_10', 'attention', 'self', 'query', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_10', 'attention', 'self', 'query', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_10', 'attention', 'self', 'value', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_10', 'attention', 'self', 'value', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_10', 'intermediate', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_10', 'intermediate', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_10', 'output', 'LayerNorm', 'beta']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_10', 'output', 'LayerNorm', 'gamma']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_10', 'output', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_10', 'output', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_11', 'attention', 'output', 'LayerNorm', 'beta']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_11', 'attention', 'output', 'LayerNorm', 'gamma']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_11', 'attention', 'output', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_11', 'attention', 'output', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_11', 'attention', 'self', 'key', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_11', 'attention', 'self', 'key', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_11', 'attention', 'self', 'query', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_11', 'attention', 'self', 'query', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_11', 'attention', 'self', 'value', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_11', 'attention', 'self', 'value', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_11', 'intermediate', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_11', 'intermediate', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_11', 'output', 'LayerNorm', 'beta']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_11', 'output', 'LayerNorm', 'gamma']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_11', 'output', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_11', 'output', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_12', 'attention', 'output', 'LayerNorm', 'beta']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_12', 'attention', 'output', 'LayerNorm', 'gamma']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_12', 'attention', 'output', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_12', 'attention', 'output', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_12', 'attention', 'self', 'key', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_12', 'attention', 'self', 'key', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_12', 'attention', 'self', 'query', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_12', 'attention', 'self', 'query', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_12', 'attention', 'self', 'value', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_12', 'attention', 'self', 'value', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_12', 'intermediate', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_12', 'intermediate', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_12', 'output', 'LayerNorm', 'beta']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_12', 'output', 'LayerNorm', 'gamma']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_12', 'output', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_12', 'output', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_13', 'attention', 'output', 'LayerNorm', 'beta']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_13', 'attention', 'output', 'LayerNorm', 'gamma']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_13', 'attention', 'output', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_13', 'attention', 'output', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_13', 'attention', 'self', 'key', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_13', 'attention', 'self', 'key', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_13', 'attention', 'self', 'query', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_13', 'attention', 'self', 'query', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_13', 'attention', 'self', 'value', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_13', 'attention', 'self', 'value', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_13', 'intermediate', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_13', 'intermediate', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_13', 'output', 'LayerNorm', 'beta']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_13', 'output', 'LayerNorm', 'gamma']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_13', 'output', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_13', 'output', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_14', 'attention', 'output', 'LayerNorm', 'beta']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_14', 'attention', 'output', 'LayerNorm', 'gamma']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_14', 'attention', 'output', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_14', 'attention', 'output', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_14', 'attention', 'self', 'key', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_14', 'attention', 'self', 'key', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_14', 'attention', 'self', 'query', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_14', 'attention', 'self', 'query', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_14', 'attention', 'self', 'value', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_14', 'attention', 'self', 'value', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_14', 'intermediate', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_14', 'intermediate', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_14', 'output', 'LayerNorm', 'beta']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_14', 'output', 'LayerNorm', 'gamma']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_14', 'output', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_14', 'output', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_15', 'attention', 'output', 'LayerNorm', 'beta']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_15', 'attention', 'output', 'LayerNorm', 'gamma']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_15', 'attention', 'output', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_15', 'attention', 'output', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_15', 'attention', 'self', 'key', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_15', 'attention', 'self', 'key', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_15', 'attention', 'self', 'query', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_15', 'attention', 'self', 'query', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_15', 'attention', 'self', 'value', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_15', 'attention', 'self', 'value', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_15', 'intermediate', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_15', 'intermediate', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_15', 'output', 'LayerNorm', 'beta']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_15', 'output', 'LayerNorm', 'gamma']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_15', 'output', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_15', 'output', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_16', 'attention', 'output', 'LayerNorm', 'beta']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_16', 'attention', 'output', 'LayerNorm', 'gamma']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_16', 'attention', 'output', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_16', 'attention', 'output', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_16', 'attention', 'self', 'key', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_16', 'attention', 'self', 'key', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_16', 'attention', 'self', 'query', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_16', 'attention', 'self', 'query', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_16', 'attention', 'self', 'value', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_16', 'attention', 'self', 'value', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_16', 'intermediate', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_16', 'intermediate', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_16', 'output', 'LayerNorm', 'beta']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_16', 'output', 'LayerNorm', 'gamma']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_16', 'output', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_16', 'output', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_17', 'attention', 'output', 'LayerNorm', 'beta']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_17', 'attention', 'output', 'LayerNorm', 'gamma']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_17', 'attention', 'output', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_17', 'attention', 'output', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_17', 'attention', 'self', 'key', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_17', 'attention', 'self', 'key', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_17', 'attention', 'self', 'query', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_17', 'attention', 'self', 'query', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_17', 'attention', 'self', 'value', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_17', 'attention', 'self', 'value', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_17', 'intermediate', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_17', 'intermediate', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_17', 'output', 'LayerNorm', 'beta']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_17', 'output', 'LayerNorm', 'gamma']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_17', 'output', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_17', 'output', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_18', 'attention', 'output', 'LayerNorm', 'beta']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_18', 'attention', 'output', 'LayerNorm', 'gamma']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_18', 'attention', 'output', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_18', 'attention', 'output', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_18', 'attention', 'self', 'key', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_18', 'attention', 'self', 'key', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_18', 'attention', 'self', 'query', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_18', 'attention', 'self', 'query', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_18', 'attention', 'self', 'value', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_18', 'attention', 'self', 'value', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_18', 'intermediate', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_18', 'intermediate', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_18', 'output', 'LayerNorm', 'beta']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_18', 'output', 'LayerNorm', 'gamma']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_18', 'output', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_18', 'output', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_19', 'attention', 'output', 'LayerNorm', 'beta']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_19', 'attention', 'output', 'LayerNorm', 'gamma']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_19', 'attention', 'output', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_19', 'attention', 'output', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_19', 'attention', 'self', 'key', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_19', 'attention', 'self', 'key', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_19', 'attention', 'self', 'query', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_19', 'attention', 'self', 'query', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_19', 'attention', 'self', 'value', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_19', 'attention', 'self', 'value', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_19', 'intermediate', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_19', 'intermediate', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_19', 'output', 'LayerNorm', 'beta']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_19', 'output', 'LayerNorm', 'gamma']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_19', 'output', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_19', 'output', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_2', 'attention', 'output', 'LayerNorm', 'beta']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_2', 'attention', 'output', 'LayerNorm', 'gamma']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_2', 'attention', 'output', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_2', 'attention', 'output', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_2', 'attention', 'self', 'key', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_2', 'attention', 'self', 'key', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_2', 'attention', 'self', 'query', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_2', 'attention', 'self', 'query', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_2', 'attention', 'self', 'value', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_2', 'attention', 'self', 'value', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_2', 'intermediate', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_2', 'intermediate', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_2', 'output', 'LayerNorm', 'beta']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_2', 'output', 'LayerNorm', 'gamma']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_2', 'output', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_2', 'output', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_20', 'attention', 'output', 'LayerNorm', 'beta']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_20', 'attention', 'output', 'LayerNorm', 'gamma']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_20', 'attention', 'output', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_20', 'attention', 'output', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_20', 'attention', 'self', 'key', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_20', 'attention', 'self', 'key', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_20', 'attention', 'self', 'query', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_20', 'attention', 'self', 'query', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_20', 'attention', 'self', 'value', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_20', 'attention', 'self', 'value', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_20', 'intermediate', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_20', 'intermediate', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_20', 'output', 'LayerNorm', 'beta']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_20', 'output', 'LayerNorm', 'gamma']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_20', 'output', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_20', 'output', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_21', 'attention', 'output', 'LayerNorm', 'beta']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_21', 'attention', 'output', 'LayerNorm', 'gamma']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_21', 'attention', 'output', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_21', 'attention', 'output', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_21', 'attention', 'self', 'key', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_21', 'attention', 'self', 'key', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_21', 'attention', 'self', 'query', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_21', 'attention', 'self', 'query', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_21', 'attention', 'self', 'value', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_21', 'attention', 'self', 'value', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_21', 'intermediate', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_21', 'intermediate', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_21', 'output', 'LayerNorm', 'beta']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_21', 'output', 'LayerNorm', 'gamma']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_21', 'output', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_21', 'output', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_22', 'attention', 'output', 'LayerNorm', 'beta']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_22', 'attention', 'output', 'LayerNorm', 'gamma']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_22', 'attention', 'output', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_22', 'attention', 'output', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_22', 'attention', 'self', 'key', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_22', 'attention', 'self', 'key', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_22', 'attention', 'self', 'query', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_22', 'attention', 'self', 'query', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_22', 'attention', 'self', 'value', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_22', 'attention', 'self', 'value', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_22', 'intermediate', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_22', 'intermediate', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_22', 'output', 'LayerNorm', 'beta']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_22', 'output', 'LayerNorm', 'gamma']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_22', 'output', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_22', 'output', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_23', 'attention', 'output', 'LayerNorm', 'beta']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_23', 'attention', 'output', 'LayerNorm', 'gamma']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_23', 'attention', 'output', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_23', 'attention', 'output', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_23', 'attention', 'self', 'key', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_23', 'attention', 'self', 'key', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_23', 'attention', 'self', 'query', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_23', 'attention', 'self', 'query', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_23', 'attention', 'self', 'value', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_23', 'attention', 'self', 'value', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_23', 'intermediate', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_23', 'intermediate', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_23', 'output', 'LayerNorm', 'beta']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_23', 'output', 'LayerNorm', 'gamma']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_23', 'output', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_23', 'output', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_3', 'attention', 'output', 'LayerNorm', 'beta']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_3', 'attention', 'output', 'LayerNorm', 'gamma']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_3', 'attention', 'output', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_3', 'attention', 'output', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_3', 'attention', 'self', 'key', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_3', 'attention', 'self', 'key', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_3', 'attention', 'self', 'query', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_3', 'attention', 'self', 'query', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_3', 'attention', 'self', 'value', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_3', 'attention', 'self', 'value', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_3', 'intermediate', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_3', 'intermediate', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_3', 'output', 'LayerNorm', 'beta']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_3', 'output', 'LayerNorm', 'gamma']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_3', 'output', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_3', 'output', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_4', 'attention', 'output', 'LayerNorm', 'beta']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_4', 'attention', 'output', 'LayerNorm', 'gamma']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_4', 'attention', 'output', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_4', 'attention', 'output', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_4', 'attention', 'self', 'key', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_4', 'attention', 'self', 'key', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_4', 'attention', 'self', 'query', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_4', 'attention', 'self', 'query', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_4', 'attention', 'self', 'value', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_4', 'attention', 'self', 'value', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_4', 'intermediate', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_4', 'intermediate', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_4', 'output', 'LayerNorm', 'beta']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_4', 'output', 'LayerNorm', 'gamma']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_4', 'output', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_4', 'output', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_5', 'attention', 'output', 'LayerNorm', 'beta']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_5', 'attention', 'output', 'LayerNorm', 'gamma']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_5', 'attention', 'output', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_5', 'attention', 'output', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_5', 'attention', 'self', 'key', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_5', 'attention', 'self', 'key', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_5', 'attention', 'self', 'query', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_5', 'attention', 'self', 'query', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_5', 'attention', 'self', 'value', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_5', 'attention', 'self', 'value', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_5', 'intermediate', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_5', 'intermediate', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_5', 'output', 'LayerNorm', 'beta']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_5', 'output', 'LayerNorm', 'gamma']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_5', 'output', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_5', 'output', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_6', 'attention', 'output', 'LayerNorm', 'beta']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_6', 'attention', 'output', 'LayerNorm', 'gamma']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_6', 'attention', 'output', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_6', 'attention', 'output', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_6', 'attention', 'self', 'key', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_6', 'attention', 'self', 'key', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_6', 'attention', 'self', 'query', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_6', 'attention', 'self', 'query', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_6', 'attention', 'self', 'value', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_6', 'attention', 'self', 'value', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_6', 'intermediate', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_6', 'intermediate', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_6', 'output', 'LayerNorm', 'beta']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_6', 'output', 'LayerNorm', 'gamma']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_6', 'output', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_6', 'output', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_7', 'attention', 'output', 'LayerNorm', 'beta']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_7', 'attention', 'output', 'LayerNorm', 'gamma']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_7', 'attention', 'output', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_7', 'attention', 'output', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_7', 'attention', 'self', 'key', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_7', 'attention', 'self', 'key', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_7', 'attention', 'self', 'query', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_7', 'attention', 'self', 'query', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_7', 'attention', 'self', 'value', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_7', 'attention', 'self', 'value', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_7', 'intermediate', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_7', 'intermediate', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_7', 'output', 'LayerNorm', 'beta']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_7', 'output', 'LayerNorm', 'gamma']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_7', 'output', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_7', 'output', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_8', 'attention', 'output', 'LayerNorm', 'beta']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_8', 'attention', 'output', 'LayerNorm', 'gamma']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_8', 'attention', 'output', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_8', 'attention', 'output', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_8', 'attention', 'self', 'key', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_8', 'attention', 'self', 'key', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_8', 'attention', 'self', 'query', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_8', 'attention', 'self', 'query', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_8', 'attention', 'self', 'value', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_8', 'attention', 'self', 'value', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_8', 'intermediate', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_8', 'intermediate', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_8', 'output', 'LayerNorm', 'beta']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_8', 'output', 'LayerNorm', 'gamma']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_8', 'output', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_8', 'output', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_9', 'attention', 'output', 'LayerNorm', 'beta']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_9', 'attention', 'output', 'LayerNorm', 'gamma']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_9', 'attention', 'output', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_9', 'attention', 'output', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_9', 'attention', 'self', 'key', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_9', 'attention', 'self', 'key', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_9', 'attention', 'self', 'query', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_9', 'attention', 'self', 'query', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_9', 'attention', 'self', 'value', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_9', 'attention', 'self', 'value', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_9', 'intermediate', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_9', 'intermediate', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_9', 'output', 'LayerNorm', 'beta']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_9', 'output', 'LayerNorm', 'gamma']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_9', 'output', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_9', 'output', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'pooler', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'pooler', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['cls', 'predictions', 'output_bias']\n",
      "Initialize PyTorch weight ['cls', 'predictions', 'transform', 'LayerNorm', 'beta']\n",
      "Initialize PyTorch weight ['cls', 'predictions', 'transform', 'LayerNorm', 'gamma']\n",
      "Initialize PyTorch weight ['cls', 'predictions', 'transform', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['cls', 'predictions', 'transform', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['cls', 'seq_relationship', 'output_bias']\n",
      "Initialize PyTorch weight ['cls', 'seq_relationship', 'output_weights']\n",
      "Save PyTorch model to ../working/pytorch_model.bin\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'../working/bert_config.json'"
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "convert_tf_checkpoint_to_pytorch.convert_tf_checkpoint_to_pytorch(\n",
    "    BERT_MODEL_PATH + 'bert_model.ckpt',\n",
    "BERT_MODEL_PATH + 'bert_config.json',\n",
    "WORK_DIR + 'pytorch_model.bin')\n",
    "\n",
    "shutil.copyfile(BERT_MODEL_PATH + 'bert_config.json', WORK_DIR + 'bert_config.json')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Build up Model & Makes Prediction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [],
   "source": [
    "BERT_OUT = 128\n",
    "DENSE_HIDDEN_UNITS = 128"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [],
   "source": [
    "class NeuralNet(nn.Module):\n",
    "    def __init__(self):\n",
    "        super(NeuralNet, self).__init__()\n",
    "        #self.bert_layer = BertForSequenceClassification(bert_config,num_labels = BERT_OUT)\n",
    "        self.bert_layer = BertForSequenceClassification.from_pretrained(\"../working\",cache_dir=None,num_labels = BERT_OUT)\n",
    "        self.linear_out = nn.Linear(BERT_OUT, 1)\n",
    "        self.linear_aux_out = nn.Linear(BERT_OUT, 7)\n",
    "        #self.identity_out = nn.Linear(BERT_OUT, 9)\n",
    "        self.drop_out_layer = nn.Dropout(p=0.1)        \n",
    "        \n",
    "    def forward(self, x, attention_mask=None, labels=None):\n",
    "        bert_out = self.bert_layer(x, attention_mask=attention_mask, labels=labels)\n",
    "        drop_out_layer_out = self.drop_out_layer(bert_out)\n",
    "        result = self.linear_out(drop_out_layer_out)\n",
    "        aux_result = self.linear_aux_out(bert_out)\n",
    "        #identity_result = self.identity_out(bert_out)\n",
    "        out = torch.cat([result, aux_result], 1)\n",
    "        \n",
    "        return out"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DataParallel(\n",
       "  (module): NeuralNet(\n",
       "    (bert_layer): BertForSequenceClassification(\n",
       "      (bert): BertModel(\n",
       "        (embeddings): BertEmbeddings(\n",
       "          (word_embeddings): Embedding(30522, 1024, padding_idx=0)\n",
       "          (position_embeddings): Embedding(512, 1024)\n",
       "          (token_type_embeddings): Embedding(2, 1024)\n",
       "          (LayerNorm): BertLayerNorm()\n",
       "          (dropout): Dropout(p=0.1)\n",
       "        )\n",
       "        (encoder): BertEncoder(\n",
       "          (layer): ModuleList(\n",
       "            (0): BertLayer(\n",
       "              (attention): BertAttention(\n",
       "                (self): BertSelfAttention(\n",
       "                  (query): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (key): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (value): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "                (output): BertSelfOutput(\n",
       "                  (dense): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (LayerNorm): BertLayerNorm()\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "              )\n",
       "              (intermediate): BertIntermediate(\n",
       "                (dense): Linear(in_features=1024, out_features=4096, bias=True)\n",
       "              )\n",
       "              (output): BertOutput(\n",
       "                (dense): Linear(in_features=4096, out_features=1024, bias=True)\n",
       "                (LayerNorm): BertLayerNorm()\n",
       "                (dropout): Dropout(p=0.1)\n",
       "              )\n",
       "            )\n",
       "            (1): BertLayer(\n",
       "              (attention): BertAttention(\n",
       "                (self): BertSelfAttention(\n",
       "                  (query): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (key): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (value): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "                (output): BertSelfOutput(\n",
       "                  (dense): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (LayerNorm): BertLayerNorm()\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "              )\n",
       "              (intermediate): BertIntermediate(\n",
       "                (dense): Linear(in_features=1024, out_features=4096, bias=True)\n",
       "              )\n",
       "              (output): BertOutput(\n",
       "                (dense): Linear(in_features=4096, out_features=1024, bias=True)\n",
       "                (LayerNorm): BertLayerNorm()\n",
       "                (dropout): Dropout(p=0.1)\n",
       "              )\n",
       "            )\n",
       "            (2): BertLayer(\n",
       "              (attention): BertAttention(\n",
       "                (self): BertSelfAttention(\n",
       "                  (query): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (key): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (value): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "                (output): BertSelfOutput(\n",
       "                  (dense): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (LayerNorm): BertLayerNorm()\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "              )\n",
       "              (intermediate): BertIntermediate(\n",
       "                (dense): Linear(in_features=1024, out_features=4096, bias=True)\n",
       "              )\n",
       "              (output): BertOutput(\n",
       "                (dense): Linear(in_features=4096, out_features=1024, bias=True)\n",
       "                (LayerNorm): BertLayerNorm()\n",
       "                (dropout): Dropout(p=0.1)\n",
       "              )\n",
       "            )\n",
       "            (3): BertLayer(\n",
       "              (attention): BertAttention(\n",
       "                (self): BertSelfAttention(\n",
       "                  (query): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (key): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (value): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "                (output): BertSelfOutput(\n",
       "                  (dense): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (LayerNorm): BertLayerNorm()\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "              )\n",
       "              (intermediate): BertIntermediate(\n",
       "                (dense): Linear(in_features=1024, out_features=4096, bias=True)\n",
       "              )\n",
       "              (output): BertOutput(\n",
       "                (dense): Linear(in_features=4096, out_features=1024, bias=True)\n",
       "                (LayerNorm): BertLayerNorm()\n",
       "                (dropout): Dropout(p=0.1)\n",
       "              )\n",
       "            )\n",
       "            (4): BertLayer(\n",
       "              (attention): BertAttention(\n",
       "                (self): BertSelfAttention(\n",
       "                  (query): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (key): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (value): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "                (output): BertSelfOutput(\n",
       "                  (dense): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (LayerNorm): BertLayerNorm()\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "              )\n",
       "              (intermediate): BertIntermediate(\n",
       "                (dense): Linear(in_features=1024, out_features=4096, bias=True)\n",
       "              )\n",
       "              (output): BertOutput(\n",
       "                (dense): Linear(in_features=4096, out_features=1024, bias=True)\n",
       "                (LayerNorm): BertLayerNorm()\n",
       "                (dropout): Dropout(p=0.1)\n",
       "              )\n",
       "            )\n",
       "            (5): BertLayer(\n",
       "              (attention): BertAttention(\n",
       "                (self): BertSelfAttention(\n",
       "                  (query): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (key): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (value): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "                (output): BertSelfOutput(\n",
       "                  (dense): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (LayerNorm): BertLayerNorm()\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "              )\n",
       "              (intermediate): BertIntermediate(\n",
       "                (dense): Linear(in_features=1024, out_features=4096, bias=True)\n",
       "              )\n",
       "              (output): BertOutput(\n",
       "                (dense): Linear(in_features=4096, out_features=1024, bias=True)\n",
       "                (LayerNorm): BertLayerNorm()\n",
       "                (dropout): Dropout(p=0.1)\n",
       "              )\n",
       "            )\n",
       "            (6): BertLayer(\n",
       "              (attention): BertAttention(\n",
       "                (self): BertSelfAttention(\n",
       "                  (query): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (key): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (value): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "                (output): BertSelfOutput(\n",
       "                  (dense): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (LayerNorm): BertLayerNorm()\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "              )\n",
       "              (intermediate): BertIntermediate(\n",
       "                (dense): Linear(in_features=1024, out_features=4096, bias=True)\n",
       "              )\n",
       "              (output): BertOutput(\n",
       "                (dense): Linear(in_features=4096, out_features=1024, bias=True)\n",
       "                (LayerNorm): BertLayerNorm()\n",
       "                (dropout): Dropout(p=0.1)\n",
       "              )\n",
       "            )\n",
       "            (7): BertLayer(\n",
       "              (attention): BertAttention(\n",
       "                (self): BertSelfAttention(\n",
       "                  (query): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (key): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (value): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "                (output): BertSelfOutput(\n",
       "                  (dense): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (LayerNorm): BertLayerNorm()\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "              )\n",
       "              (intermediate): BertIntermediate(\n",
       "                (dense): Linear(in_features=1024, out_features=4096, bias=True)\n",
       "              )\n",
       "              (output): BertOutput(\n",
       "                (dense): Linear(in_features=4096, out_features=1024, bias=True)\n",
       "                (LayerNorm): BertLayerNorm()\n",
       "                (dropout): Dropout(p=0.1)\n",
       "              )\n",
       "            )\n",
       "            (8): BertLayer(\n",
       "              (attention): BertAttention(\n",
       "                (self): BertSelfAttention(\n",
       "                  (query): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (key): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (value): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "                (output): BertSelfOutput(\n",
       "                  (dense): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (LayerNorm): BertLayerNorm()\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "              )\n",
       "              (intermediate): BertIntermediate(\n",
       "                (dense): Linear(in_features=1024, out_features=4096, bias=True)\n",
       "              )\n",
       "              (output): BertOutput(\n",
       "                (dense): Linear(in_features=4096, out_features=1024, bias=True)\n",
       "                (LayerNorm): BertLayerNorm()\n",
       "                (dropout): Dropout(p=0.1)\n",
       "              )\n",
       "            )\n",
       "            (9): BertLayer(\n",
       "              (attention): BertAttention(\n",
       "                (self): BertSelfAttention(\n",
       "                  (query): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (key): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (value): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "                (output): BertSelfOutput(\n",
       "                  (dense): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (LayerNorm): BertLayerNorm()\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "              )\n",
       "              (intermediate): BertIntermediate(\n",
       "                (dense): Linear(in_features=1024, out_features=4096, bias=True)\n",
       "              )\n",
       "              (output): BertOutput(\n",
       "                (dense): Linear(in_features=4096, out_features=1024, bias=True)\n",
       "                (LayerNorm): BertLayerNorm()\n",
       "                (dropout): Dropout(p=0.1)\n",
       "              )\n",
       "            )\n",
       "            (10): BertLayer(\n",
       "              (attention): BertAttention(\n",
       "                (self): BertSelfAttention(\n",
       "                  (query): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (key): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (value): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "                (output): BertSelfOutput(\n",
       "                  (dense): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (LayerNorm): BertLayerNorm()\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "              )\n",
       "              (intermediate): BertIntermediate(\n",
       "                (dense): Linear(in_features=1024, out_features=4096, bias=True)\n",
       "              )\n",
       "              (output): BertOutput(\n",
       "                (dense): Linear(in_features=4096, out_features=1024, bias=True)\n",
       "                (LayerNorm): BertLayerNorm()\n",
       "                (dropout): Dropout(p=0.1)\n",
       "              )\n",
       "            )\n",
       "            (11): BertLayer(\n",
       "              (attention): BertAttention(\n",
       "                (self): BertSelfAttention(\n",
       "                  (query): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (key): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (value): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "                (output): BertSelfOutput(\n",
       "                  (dense): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (LayerNorm): BertLayerNorm()\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "              )\n",
       "              (intermediate): BertIntermediate(\n",
       "                (dense): Linear(in_features=1024, out_features=4096, bias=True)\n",
       "              )\n",
       "              (output): BertOutput(\n",
       "                (dense): Linear(in_features=4096, out_features=1024, bias=True)\n",
       "                (LayerNorm): BertLayerNorm()\n",
       "                (dropout): Dropout(p=0.1)\n",
       "              )\n",
       "            )\n",
       "            (12): BertLayer(\n",
       "              (attention): BertAttention(\n",
       "                (self): BertSelfAttention(\n",
       "                  (query): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (key): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (value): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "                (output): BertSelfOutput(\n",
       "                  (dense): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (LayerNorm): BertLayerNorm()\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "              )\n",
       "              (intermediate): BertIntermediate(\n",
       "                (dense): Linear(in_features=1024, out_features=4096, bias=True)\n",
       "              )\n",
       "              (output): BertOutput(\n",
       "                (dense): Linear(in_features=4096, out_features=1024, bias=True)\n",
       "                (LayerNorm): BertLayerNorm()\n",
       "                (dropout): Dropout(p=0.1)\n",
       "              )\n",
       "            )\n",
       "            (13): BertLayer(\n",
       "              (attention): BertAttention(\n",
       "                (self): BertSelfAttention(\n",
       "                  (query): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (key): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (value): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "                (output): BertSelfOutput(\n",
       "                  (dense): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (LayerNorm): BertLayerNorm()\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "              )\n",
       "              (intermediate): BertIntermediate(\n",
       "                (dense): Linear(in_features=1024, out_features=4096, bias=True)\n",
       "              )\n",
       "              (output): BertOutput(\n",
       "                (dense): Linear(in_features=4096, out_features=1024, bias=True)\n",
       "                (LayerNorm): BertLayerNorm()\n",
       "                (dropout): Dropout(p=0.1)\n",
       "              )\n",
       "            )\n",
       "            (14): BertLayer(\n",
       "              (attention): BertAttention(\n",
       "                (self): BertSelfAttention(\n",
       "                  (query): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (key): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (value): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "                (output): BertSelfOutput(\n",
       "                  (dense): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (LayerNorm): BertLayerNorm()\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "              )\n",
       "              (intermediate): BertIntermediate(\n",
       "                (dense): Linear(in_features=1024, out_features=4096, bias=True)\n",
       "              )\n",
       "              (output): BertOutput(\n",
       "                (dense): Linear(in_features=4096, out_features=1024, bias=True)\n",
       "                (LayerNorm): BertLayerNorm()\n",
       "                (dropout): Dropout(p=0.1)\n",
       "              )\n",
       "            )\n",
       "            (15): BertLayer(\n",
       "              (attention): BertAttention(\n",
       "                (self): BertSelfAttention(\n",
       "                  (query): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (key): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (value): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "                (output): BertSelfOutput(\n",
       "                  (dense): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (LayerNorm): BertLayerNorm()\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "              )\n",
       "              (intermediate): BertIntermediate(\n",
       "                (dense): Linear(in_features=1024, out_features=4096, bias=True)\n",
       "              )\n",
       "              (output): BertOutput(\n",
       "                (dense): Linear(in_features=4096, out_features=1024, bias=True)\n",
       "                (LayerNorm): BertLayerNorm()\n",
       "                (dropout): Dropout(p=0.1)\n",
       "              )\n",
       "            )\n",
       "            (16): BertLayer(\n",
       "              (attention): BertAttention(\n",
       "                (self): BertSelfAttention(\n",
       "                  (query): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (key): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (value): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "                (output): BertSelfOutput(\n",
       "                  (dense): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (LayerNorm): BertLayerNorm()\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "              )\n",
       "              (intermediate): BertIntermediate(\n",
       "                (dense): Linear(in_features=1024, out_features=4096, bias=True)\n",
       "              )\n",
       "              (output): BertOutput(\n",
       "                (dense): Linear(in_features=4096, out_features=1024, bias=True)\n",
       "                (LayerNorm): BertLayerNorm()\n",
       "                (dropout): Dropout(p=0.1)\n",
       "              )\n",
       "            )\n",
       "            (17): BertLayer(\n",
       "              (attention): BertAttention(\n",
       "                (self): BertSelfAttention(\n",
       "                  (query): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (key): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (value): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "                (output): BertSelfOutput(\n",
       "                  (dense): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (LayerNorm): BertLayerNorm()\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "              )\n",
       "              (intermediate): BertIntermediate(\n",
       "                (dense): Linear(in_features=1024, out_features=4096, bias=True)\n",
       "              )\n",
       "              (output): BertOutput(\n",
       "                (dense): Linear(in_features=4096, out_features=1024, bias=True)\n",
       "                (LayerNorm): BertLayerNorm()\n",
       "                (dropout): Dropout(p=0.1)\n",
       "              )\n",
       "            )\n",
       "            (18): BertLayer(\n",
       "              (attention): BertAttention(\n",
       "                (self): BertSelfAttention(\n",
       "                  (query): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (key): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (value): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "                (output): BertSelfOutput(\n",
       "                  (dense): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (LayerNorm): BertLayerNorm()\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "              )\n",
       "              (intermediate): BertIntermediate(\n",
       "                (dense): Linear(in_features=1024, out_features=4096, bias=True)\n",
       "              )\n",
       "              (output): BertOutput(\n",
       "                (dense): Linear(in_features=4096, out_features=1024, bias=True)\n",
       "                (LayerNorm): BertLayerNorm()\n",
       "                (dropout): Dropout(p=0.1)\n",
       "              )\n",
       "            )\n",
       "            (19): BertLayer(\n",
       "              (attention): BertAttention(\n",
       "                (self): BertSelfAttention(\n",
       "                  (query): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (key): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (value): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "                (output): BertSelfOutput(\n",
       "                  (dense): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (LayerNorm): BertLayerNorm()\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "              )\n",
       "              (intermediate): BertIntermediate(\n",
       "                (dense): Linear(in_features=1024, out_features=4096, bias=True)\n",
       "              )\n",
       "              (output): BertOutput(\n",
       "                (dense): Linear(in_features=4096, out_features=1024, bias=True)\n",
       "                (LayerNorm): BertLayerNorm()\n",
       "                (dropout): Dropout(p=0.1)\n",
       "              )\n",
       "            )\n",
       "            (20): BertLayer(\n",
       "              (attention): BertAttention(\n",
       "                (self): BertSelfAttention(\n",
       "                  (query): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (key): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (value): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "                (output): BertSelfOutput(\n",
       "                  (dense): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (LayerNorm): BertLayerNorm()\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "              )\n",
       "              (intermediate): BertIntermediate(\n",
       "                (dense): Linear(in_features=1024, out_features=4096, bias=True)\n",
       "              )\n",
       "              (output): BertOutput(\n",
       "                (dense): Linear(in_features=4096, out_features=1024, bias=True)\n",
       "                (LayerNorm): BertLayerNorm()\n",
       "                (dropout): Dropout(p=0.1)\n",
       "              )\n",
       "            )\n",
       "            (21): BertLayer(\n",
       "              (attention): BertAttention(\n",
       "                (self): BertSelfAttention(\n",
       "                  (query): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (key): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (value): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "                (output): BertSelfOutput(\n",
       "                  (dense): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (LayerNorm): BertLayerNorm()\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "              )\n",
       "              (intermediate): BertIntermediate(\n",
       "                (dense): Linear(in_features=1024, out_features=4096, bias=True)\n",
       "              )\n",
       "              (output): BertOutput(\n",
       "                (dense): Linear(in_features=4096, out_features=1024, bias=True)\n",
       "                (LayerNorm): BertLayerNorm()\n",
       "                (dropout): Dropout(p=0.1)\n",
       "              )\n",
       "            )\n",
       "            (22): BertLayer(\n",
       "              (attention): BertAttention(\n",
       "                (self): BertSelfAttention(\n",
       "                  (query): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (key): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (value): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "                (output): BertSelfOutput(\n",
       "                  (dense): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (LayerNorm): BertLayerNorm()\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "              )\n",
       "              (intermediate): BertIntermediate(\n",
       "                (dense): Linear(in_features=1024, out_features=4096, bias=True)\n",
       "              )\n",
       "              (output): BertOutput(\n",
       "                (dense): Linear(in_features=4096, out_features=1024, bias=True)\n",
       "                (LayerNorm): BertLayerNorm()\n",
       "                (dropout): Dropout(p=0.1)\n",
       "              )\n",
       "            )\n",
       "            (23): BertLayer(\n",
       "              (attention): BertAttention(\n",
       "                (self): BertSelfAttention(\n",
       "                  (query): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (key): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (value): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "                (output): BertSelfOutput(\n",
       "                  (dense): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "                  (LayerNorm): BertLayerNorm()\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "              )\n",
       "              (intermediate): BertIntermediate(\n",
       "                (dense): Linear(in_features=1024, out_features=4096, bias=True)\n",
       "              )\n",
       "              (output): BertOutput(\n",
       "                (dense): Linear(in_features=4096, out_features=1024, bias=True)\n",
       "                (LayerNorm): BertLayerNorm()\n",
       "                (dropout): Dropout(p=0.1)\n",
       "              )\n",
       "            )\n",
       "          )\n",
       "        )\n",
       "        (pooler): BertPooler(\n",
       "          (dense): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "          (activation): Tanh()\n",
       "        )\n",
       "      )\n",
       "      (dropout): Dropout(p=0.1)\n",
       "      (classifier): Linear(in_features=1024, out_features=128, bias=True)\n",
       "    )\n",
       "    (linear_out): Linear(in_features=128, out_features=1, bias=True)\n",
       "    (linear_aux_out): Linear(in_features=128, out_features=7, bias=True)\n",
       "    (drop_out_layer): Dropout(p=0.1)\n",
       "  )\n",
       ")"
      ]
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model = NeuralNet()\n",
    "model = torch.nn.DataParallel(model)\n",
    "model.load_state_dict(torch.load(\"../input/bert-large/bert_large_0.bin\"))\n",
    "model.cuda()\n",
    "for param in model.parameters():\n",
    "    param.requires_grad = False\n",
    "model.eval()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_collator = TestCollator(0,1)\n",
    "seq_length_torch = torch.tensor(seq_length,dtype=torch.long)\n",
    "X_test_torch =torch.tensor(X_test, dtype=torch.long)\n",
    "test = torch.utils.data.TensorDataset(X_test_torch,seq_length_torch)\n",
    "test_loader = torch.utils.data.DataLoader(test, batch_size=32, shuffle=False,collate_fn=test_collator)\n",
    "test_preds = np.zeros((len(X_test)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "e4f8beaeaf3645f6b4b8a16fa9741489",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(IntProgress(value=0, max=3042), HTML(value='')))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "tk0 = tqdm_notebook(test_loader)\n",
    "for i, x_batch in enumerate(tk0):\n",
    "    pred = model(x_batch.to(device), attention_mask=(x_batch > 0).to(device), labels=None)\n",
    "    test_preds[i * 32:(i + 1) * 32] = pred[:, 0].detach().cpu().squeeze().numpy()\n",
    "\n",
    "test_pred = torch.sigmoid(torch.tensor(test_preds)).numpy().ravel()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [],
   "source": [
    "submission_bert_large = pd.DataFrame.from_dict({\n",
    "    'id': test_df['id'],\n",
    "    'prediction': test_pred\n",
    "})\n",
    "#submission_bert_large.to_csv('submission_bert_large.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>prediction</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>7097320</td>\n",
       "      <td>0.000141</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>7097321</td>\n",
       "      <td>0.016696</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>7097322</td>\n",
       "      <td>0.061666</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>7097323</td>\n",
       "      <td>0.003367</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>7097324</td>\n",
       "      <td>0.001364</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        id  prediction\n",
       "0  7097320    0.000141\n",
       "1  7097321    0.016696\n",
       "2  7097322    0.061666\n",
       "3  7097323    0.003367\n",
       "4  7097324    0.001364"
      ]
     },
     "execution_count": 54,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "submission_bert_large.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Bert Base\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Tokenization For Bert"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Converting the lines to BERT format\n",
    "# Thanks to https://www.kaggle.com/httpwwwfszyc/bert-in-keras-taming\n",
    "def convert_lines(example, max_seq_length,tokenizer):\n",
    "    max_seq_length -=2\n",
    "    all_tokens = []\n",
    "    all_length = []\n",
    "    longer = 0\n",
    "    for text in tqdm_notebook(example):\n",
    "        tokens_a = tokenizer.tokenize(text)\n",
    "        if len(tokens_a)>max_seq_length:\n",
    "            tokens_a = tokens_a[:max_seq_length]\n",
    "            longer += 1\n",
    "        one_token = tokenizer.convert_tokens_to_ids([\"[CLS]\"]+tokens_a+[\"[SEP]\"])\n",
    "        one_token_length = len(one_token)\n",
    "        one_token = one_token + [0] * (max_seq_length - len(tokens_a))\n",
    "        all_tokens.append(one_token)\n",
    "        all_length.append(one_token_length)\n",
    "    print(longer)\n",
    "    return np.array(all_tokens),np.array(all_length) # Note that including Sequence Length Here"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {
    "_cell_guid": "79c7e3d0-c299-4dcb-8224-4455121ee9b0",
    "_uuid": "d629ff2d2480ee46fbb7e2d37f6b5fab8052498a"
   },
   "outputs": [],
   "source": [
    "MAX_SEQUENCE_LENGTH = 320 ### 320 is enough\n",
    "SEED = 1234 \n",
    "BATCH_SIZE = 64\n",
    "BERT_MODEL_PATH = '../input/bert-pretrained-model-uncased/uncased_l-12_h-768_a-12/uncased_L-12_H-768_A-12/'\n",
    "Data_dir=\"../input/jigsaw-unintended-bias-in-toxicity-classification\" \n",
    "Input_dir = \"../input\"  ###############\n",
    "WORK_DIR = \"../working/\" ############\n",
    "\n",
    "\n",
    "\n",
    "#bert_config = BertConfig('../input/prob-target333/bert_config.json')\n",
    "tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_PATH, cache_dir=None,do_lower_case=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Convert Tensorflow Weight to Pytorch Weight"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Building PyTorch model from configuration: {\n",
      "  \"attention_probs_dropout_prob\": 0.1,\n",
      "  \"hidden_act\": \"gelu\",\n",
      "  \"hidden_dropout_prob\": 0.1,\n",
      "  \"hidden_size\": 768,\n",
      "  \"initializer_range\": 0.02,\n",
      "  \"intermediate_size\": 3072,\n",
      "  \"layer_norm_eps\": 1e-12,\n",
      "  \"max_position_embeddings\": 512,\n",
      "  \"num_attention_heads\": 12,\n",
      "  \"num_hidden_layers\": 12,\n",
      "  \"type_vocab_size\": 2,\n",
      "  \"vocab_size\": 30522\n",
      "}\n",
      "\n",
      "Converting TensorFlow checkpoint from /kaggle/input/bert-pretrained-model-uncased/uncased_l-12_h-768_a-12/uncased_L-12_H-768_A-12/bert_model.ckpt\n",
      "Loading TF weight bert/embeddings/LayerNorm/beta with shape [768]\n",
      "Loading TF weight bert/embeddings/LayerNorm/gamma with shape [768]\n",
      "Loading TF weight bert/embeddings/position_embeddings with shape [512, 768]\n",
      "Loading TF weight bert/embeddings/token_type_embeddings with shape [2, 768]\n",
      "Loading TF weight bert/embeddings/word_embeddings with shape [30522, 768]\n",
      "Loading TF weight bert/encoder/layer_0/attention/output/LayerNorm/beta with shape [768]\n",
      "Loading TF weight bert/encoder/layer_0/attention/output/LayerNorm/gamma with shape [768]\n",
      "Loading TF weight bert/encoder/layer_0/attention/output/dense/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_0/attention/output/dense/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_0/attention/self/key/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_0/attention/self/key/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_0/attention/self/query/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_0/attention/self/query/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_0/attention/self/value/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_0/attention/self/value/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_0/intermediate/dense/bias with shape [3072]\n",
      "Loading TF weight bert/encoder/layer_0/intermediate/dense/kernel with shape [768, 3072]\n",
      "Loading TF weight bert/encoder/layer_0/output/LayerNorm/beta with shape [768]\n",
      "Loading TF weight bert/encoder/layer_0/output/LayerNorm/gamma with shape [768]\n",
      "Loading TF weight bert/encoder/layer_0/output/dense/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_0/output/dense/kernel with shape [3072, 768]\n",
      "Loading TF weight bert/encoder/layer_1/attention/output/LayerNorm/beta with shape [768]\n",
      "Loading TF weight bert/encoder/layer_1/attention/output/LayerNorm/gamma with shape [768]\n",
      "Loading TF weight bert/encoder/layer_1/attention/output/dense/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_1/attention/output/dense/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_1/attention/self/key/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_1/attention/self/key/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_1/attention/self/query/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_1/attention/self/query/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_1/attention/self/value/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_1/attention/self/value/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_1/intermediate/dense/bias with shape [3072]\n",
      "Loading TF weight bert/encoder/layer_1/intermediate/dense/kernel with shape [768, 3072]\n",
      "Loading TF weight bert/encoder/layer_1/output/LayerNorm/beta with shape [768]\n",
      "Loading TF weight bert/encoder/layer_1/output/LayerNorm/gamma with shape [768]\n",
      "Loading TF weight bert/encoder/layer_1/output/dense/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_1/output/dense/kernel with shape [3072, 768]\n",
      "Loading TF weight bert/encoder/layer_10/attention/output/LayerNorm/beta with shape [768]\n",
      "Loading TF weight bert/encoder/layer_10/attention/output/LayerNorm/gamma with shape [768]\n",
      "Loading TF weight bert/encoder/layer_10/attention/output/dense/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_10/attention/output/dense/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_10/attention/self/key/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_10/attention/self/key/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_10/attention/self/query/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_10/attention/self/query/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_10/attention/self/value/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_10/attention/self/value/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_10/intermediate/dense/bias with shape [3072]\n",
      "Loading TF weight bert/encoder/layer_10/intermediate/dense/kernel with shape [768, 3072]\n",
      "Loading TF weight bert/encoder/layer_10/output/LayerNorm/beta with shape [768]\n",
      "Loading TF weight bert/encoder/layer_10/output/LayerNorm/gamma with shape [768]\n",
      "Loading TF weight bert/encoder/layer_10/output/dense/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_10/output/dense/kernel with shape [3072, 768]\n",
      "Loading TF weight bert/encoder/layer_11/attention/output/LayerNorm/beta with shape [768]\n",
      "Loading TF weight bert/encoder/layer_11/attention/output/LayerNorm/gamma with shape [768]\n",
      "Loading TF weight bert/encoder/layer_11/attention/output/dense/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_11/attention/output/dense/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_11/attention/self/key/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_11/attention/self/key/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_11/attention/self/query/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_11/attention/self/query/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_11/attention/self/value/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_11/attention/self/value/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_11/intermediate/dense/bias with shape [3072]\n",
      "Loading TF weight bert/encoder/layer_11/intermediate/dense/kernel with shape [768, 3072]\n",
      "Loading TF weight bert/encoder/layer_11/output/LayerNorm/beta with shape [768]\n",
      "Loading TF weight bert/encoder/layer_11/output/LayerNorm/gamma with shape [768]\n",
      "Loading TF weight bert/encoder/layer_11/output/dense/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_11/output/dense/kernel with shape [3072, 768]\n",
      "Loading TF weight bert/encoder/layer_2/attention/output/LayerNorm/beta with shape [768]\n",
      "Loading TF weight bert/encoder/layer_2/attention/output/LayerNorm/gamma with shape [768]\n",
      "Loading TF weight bert/encoder/layer_2/attention/output/dense/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_2/attention/output/dense/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_2/attention/self/key/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_2/attention/self/key/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_2/attention/self/query/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_2/attention/self/query/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_2/attention/self/value/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_2/attention/self/value/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_2/intermediate/dense/bias with shape [3072]\n",
      "Loading TF weight bert/encoder/layer_2/intermediate/dense/kernel with shape [768, 3072]\n",
      "Loading TF weight bert/encoder/layer_2/output/LayerNorm/beta with shape [768]\n",
      "Loading TF weight bert/encoder/layer_2/output/LayerNorm/gamma with shape [768]\n",
      "Loading TF weight bert/encoder/layer_2/output/dense/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_2/output/dense/kernel with shape [3072, 768]\n",
      "Loading TF weight bert/encoder/layer_3/attention/output/LayerNorm/beta with shape [768]\n",
      "Loading TF weight bert/encoder/layer_3/attention/output/LayerNorm/gamma with shape [768]\n",
      "Loading TF weight bert/encoder/layer_3/attention/output/dense/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_3/attention/output/dense/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_3/attention/self/key/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_3/attention/self/key/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_3/attention/self/query/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_3/attention/self/query/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_3/attention/self/value/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_3/attention/self/value/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_3/intermediate/dense/bias with shape [3072]\n",
      "Loading TF weight bert/encoder/layer_3/intermediate/dense/kernel with shape [768, 3072]\n",
      "Loading TF weight bert/encoder/layer_3/output/LayerNorm/beta with shape [768]\n",
      "Loading TF weight bert/encoder/layer_3/output/LayerNorm/gamma with shape [768]\n",
      "Loading TF weight bert/encoder/layer_3/output/dense/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_3/output/dense/kernel with shape [3072, 768]\n",
      "Loading TF weight bert/encoder/layer_4/attention/output/LayerNorm/beta with shape [768]\n",
      "Loading TF weight bert/encoder/layer_4/attention/output/LayerNorm/gamma with shape [768]\n",
      "Loading TF weight bert/encoder/layer_4/attention/output/dense/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_4/attention/output/dense/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_4/attention/self/key/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_4/attention/self/key/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_4/attention/self/query/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_4/attention/self/query/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_4/attention/self/value/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_4/attention/self/value/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_4/intermediate/dense/bias with shape [3072]\n",
      "Loading TF weight bert/encoder/layer_4/intermediate/dense/kernel with shape [768, 3072]\n",
      "Loading TF weight bert/encoder/layer_4/output/LayerNorm/beta with shape [768]\n",
      "Loading TF weight bert/encoder/layer_4/output/LayerNorm/gamma with shape [768]\n",
      "Loading TF weight bert/encoder/layer_4/output/dense/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_4/output/dense/kernel with shape [3072, 768]\n",
      "Loading TF weight bert/encoder/layer_5/attention/output/LayerNorm/beta with shape [768]\n",
      "Loading TF weight bert/encoder/layer_5/attention/output/LayerNorm/gamma with shape [768]\n",
      "Loading TF weight bert/encoder/layer_5/attention/output/dense/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_5/attention/output/dense/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_5/attention/self/key/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_5/attention/self/key/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_5/attention/self/query/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_5/attention/self/query/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_5/attention/self/value/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_5/attention/self/value/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_5/intermediate/dense/bias with shape [3072]\n",
      "Loading TF weight bert/encoder/layer_5/intermediate/dense/kernel with shape [768, 3072]\n",
      "Loading TF weight bert/encoder/layer_5/output/LayerNorm/beta with shape [768]\n",
      "Loading TF weight bert/encoder/layer_5/output/LayerNorm/gamma with shape [768]\n",
      "Loading TF weight bert/encoder/layer_5/output/dense/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_5/output/dense/kernel with shape [3072, 768]\n",
      "Loading TF weight bert/encoder/layer_6/attention/output/LayerNorm/beta with shape [768]\n",
      "Loading TF weight bert/encoder/layer_6/attention/output/LayerNorm/gamma with shape [768]\n",
      "Loading TF weight bert/encoder/layer_6/attention/output/dense/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_6/attention/output/dense/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_6/attention/self/key/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_6/attention/self/key/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_6/attention/self/query/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_6/attention/self/query/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_6/attention/self/value/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_6/attention/self/value/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_6/intermediate/dense/bias with shape [3072]\n",
      "Loading TF weight bert/encoder/layer_6/intermediate/dense/kernel with shape [768, 3072]\n",
      "Loading TF weight bert/encoder/layer_6/output/LayerNorm/beta with shape [768]\n",
      "Loading TF weight bert/encoder/layer_6/output/LayerNorm/gamma with shape [768]\n",
      "Loading TF weight bert/encoder/layer_6/output/dense/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_6/output/dense/kernel with shape [3072, 768]\n",
      "Loading TF weight bert/encoder/layer_7/attention/output/LayerNorm/beta with shape [768]\n",
      "Loading TF weight bert/encoder/layer_7/attention/output/LayerNorm/gamma with shape [768]\n",
      "Loading TF weight bert/encoder/layer_7/attention/output/dense/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_7/attention/output/dense/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_7/attention/self/key/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_7/attention/self/key/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_7/attention/self/query/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_7/attention/self/query/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_7/attention/self/value/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_7/attention/self/value/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_7/intermediate/dense/bias with shape [3072]\n",
      "Loading TF weight bert/encoder/layer_7/intermediate/dense/kernel with shape [768, 3072]\n",
      "Loading TF weight bert/encoder/layer_7/output/LayerNorm/beta with shape [768]\n",
      "Loading TF weight bert/encoder/layer_7/output/LayerNorm/gamma with shape [768]\n",
      "Loading TF weight bert/encoder/layer_7/output/dense/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_7/output/dense/kernel with shape [3072, 768]\n",
      "Loading TF weight bert/encoder/layer_8/attention/output/LayerNorm/beta with shape [768]\n",
      "Loading TF weight bert/encoder/layer_8/attention/output/LayerNorm/gamma with shape [768]\n",
      "Loading TF weight bert/encoder/layer_8/attention/output/dense/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_8/attention/output/dense/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_8/attention/self/key/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_8/attention/self/key/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_8/attention/self/query/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_8/attention/self/query/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_8/attention/self/value/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_8/attention/self/value/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_8/intermediate/dense/bias with shape [3072]\n",
      "Loading TF weight bert/encoder/layer_8/intermediate/dense/kernel with shape [768, 3072]\n",
      "Loading TF weight bert/encoder/layer_8/output/LayerNorm/beta with shape [768]\n",
      "Loading TF weight bert/encoder/layer_8/output/LayerNorm/gamma with shape [768]\n",
      "Loading TF weight bert/encoder/layer_8/output/dense/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_8/output/dense/kernel with shape [3072, 768]\n",
      "Loading TF weight bert/encoder/layer_9/attention/output/LayerNorm/beta with shape [768]\n",
      "Loading TF weight bert/encoder/layer_9/attention/output/LayerNorm/gamma with shape [768]\n",
      "Loading TF weight bert/encoder/layer_9/attention/output/dense/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_9/attention/output/dense/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_9/attention/self/key/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_9/attention/self/key/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_9/attention/self/query/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_9/attention/self/query/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_9/attention/self/value/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_9/attention/self/value/kernel with shape [768, 768]\n",
      "Loading TF weight bert/encoder/layer_9/intermediate/dense/bias with shape [3072]\n",
      "Loading TF weight bert/encoder/layer_9/intermediate/dense/kernel with shape [768, 3072]\n",
      "Loading TF weight bert/encoder/layer_9/output/LayerNorm/beta with shape [768]\n",
      "Loading TF weight bert/encoder/layer_9/output/LayerNorm/gamma with shape [768]\n",
      "Loading TF weight bert/encoder/layer_9/output/dense/bias with shape [768]\n",
      "Loading TF weight bert/encoder/layer_9/output/dense/kernel with shape [3072, 768]\n",
      "Loading TF weight bert/pooler/dense/bias with shape [768]\n",
      "Loading TF weight bert/pooler/dense/kernel with shape [768, 768]\n",
      "Loading TF weight cls/predictions/output_bias with shape [30522]\n",
      "Loading TF weight cls/predictions/transform/LayerNorm/beta with shape [768]\n",
      "Loading TF weight cls/predictions/transform/LayerNorm/gamma with shape [768]\n",
      "Loading TF weight cls/predictions/transform/dense/bias with shape [768]\n",
      "Loading TF weight cls/predictions/transform/dense/kernel with shape [768, 768]\n",
      "Loading TF weight cls/seq_relationship/output_bias with shape [2]\n",
      "Loading TF weight cls/seq_relationship/output_weights with shape [2, 768]\n",
      "Initialize PyTorch weight ['bert', 'embeddings', 'LayerNorm', 'beta']\n",
      "Initialize PyTorch weight ['bert', 'embeddings', 'LayerNorm', 'gamma']\n",
      "Initialize PyTorch weight ['bert', 'embeddings', 'position_embeddings']\n",
      "Initialize PyTorch weight ['bert', 'embeddings', 'token_type_embeddings']\n",
      "Initialize PyTorch weight ['bert', 'embeddings', 'word_embeddings']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_0', 'attention', 'output', 'LayerNorm', 'beta']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_0', 'attention', 'output', 'LayerNorm', 'gamma']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_0', 'attention', 'output', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_0', 'attention', 'output', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_0', 'attention', 'self', 'key', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_0', 'attention', 'self', 'key', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_0', 'attention', 'self', 'query', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_0', 'attention', 'self', 'query', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_0', 'attention', 'self', 'value', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_0', 'attention', 'self', 'value', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_0', 'intermediate', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_0', 'intermediate', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_0', 'output', 'LayerNorm', 'beta']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_0', 'output', 'LayerNorm', 'gamma']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_0', 'output', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_0', 'output', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_1', 'attention', 'output', 'LayerNorm', 'beta']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_1', 'attention', 'output', 'LayerNorm', 'gamma']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_1', 'attention', 'output', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_1', 'attention', 'output', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_1', 'attention', 'self', 'key', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_1', 'attention', 'self', 'key', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_1', 'attention', 'self', 'query', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_1', 'attention', 'self', 'query', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_1', 'attention', 'self', 'value', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_1', 'attention', 'self', 'value', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_1', 'intermediate', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_1', 'intermediate', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_1', 'output', 'LayerNorm', 'beta']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_1', 'output', 'LayerNorm', 'gamma']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_1', 'output', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_1', 'output', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_10', 'attention', 'output', 'LayerNorm', 'beta']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_10', 'attention', 'output', 'LayerNorm', 'gamma']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_10', 'attention', 'output', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_10', 'attention', 'output', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_10', 'attention', 'self', 'key', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_10', 'attention', 'self', 'key', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_10', 'attention', 'self', 'query', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_10', 'attention', 'self', 'query', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_10', 'attention', 'self', 'value', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_10', 'attention', 'self', 'value', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_10', 'intermediate', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_10', 'intermediate', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_10', 'output', 'LayerNorm', 'beta']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_10', 'output', 'LayerNorm', 'gamma']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_10', 'output', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_10', 'output', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_11', 'attention', 'output', 'LayerNorm', 'beta']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_11', 'attention', 'output', 'LayerNorm', 'gamma']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_11', 'attention', 'output', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_11', 'attention', 'output', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_11', 'attention', 'self', 'key', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_11', 'attention', 'self', 'key', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_11', 'attention', 'self', 'query', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_11', 'attention', 'self', 'query', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_11', 'attention', 'self', 'value', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_11', 'attention', 'self', 'value', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_11', 'intermediate', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_11', 'intermediate', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_11', 'output', 'LayerNorm', 'beta']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_11', 'output', 'LayerNorm', 'gamma']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_11', 'output', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_11', 'output', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_2', 'attention', 'output', 'LayerNorm', 'beta']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_2', 'attention', 'output', 'LayerNorm', 'gamma']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_2', 'attention', 'output', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_2', 'attention', 'output', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_2', 'attention', 'self', 'key', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_2', 'attention', 'self', 'key', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_2', 'attention', 'self', 'query', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_2', 'attention', 'self', 'query', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_2', 'attention', 'self', 'value', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_2', 'attention', 'self', 'value', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_2', 'intermediate', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_2', 'intermediate', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_2', 'output', 'LayerNorm', 'beta']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_2', 'output', 'LayerNorm', 'gamma']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_2', 'output', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_2', 'output', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_3', 'attention', 'output', 'LayerNorm', 'beta']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_3', 'attention', 'output', 'LayerNorm', 'gamma']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_3', 'attention', 'output', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_3', 'attention', 'output', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_3', 'attention', 'self', 'key', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_3', 'attention', 'self', 'key', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_3', 'attention', 'self', 'query', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_3', 'attention', 'self', 'query', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_3', 'attention', 'self', 'value', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_3', 'attention', 'self', 'value', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_3', 'intermediate', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_3', 'intermediate', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_3', 'output', 'LayerNorm', 'beta']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_3', 'output', 'LayerNorm', 'gamma']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_3', 'output', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_3', 'output', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_4', 'attention', 'output', 'LayerNorm', 'beta']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_4', 'attention', 'output', 'LayerNorm', 'gamma']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_4', 'attention', 'output', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_4', 'attention', 'output', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_4', 'attention', 'self', 'key', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_4', 'attention', 'self', 'key', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_4', 'attention', 'self', 'query', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_4', 'attention', 'self', 'query', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_4', 'attention', 'self', 'value', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_4', 'attention', 'self', 'value', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_4', 'intermediate', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_4', 'intermediate', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_4', 'output', 'LayerNorm', 'beta']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_4', 'output', 'LayerNorm', 'gamma']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_4', 'output', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_4', 'output', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_5', 'attention', 'output', 'LayerNorm', 'beta']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_5', 'attention', 'output', 'LayerNorm', 'gamma']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_5', 'attention', 'output', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_5', 'attention', 'output', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_5', 'attention', 'self', 'key', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_5', 'attention', 'self', 'key', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_5', 'attention', 'self', 'query', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_5', 'attention', 'self', 'query', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_5', 'attention', 'self', 'value', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_5', 'attention', 'self', 'value', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_5', 'intermediate', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_5', 'intermediate', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_5', 'output', 'LayerNorm', 'beta']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_5', 'output', 'LayerNorm', 'gamma']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_5', 'output', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_5', 'output', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_6', 'attention', 'output', 'LayerNorm', 'beta']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_6', 'attention', 'output', 'LayerNorm', 'gamma']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_6', 'attention', 'output', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_6', 'attention', 'output', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_6', 'attention', 'self', 'key', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_6', 'attention', 'self', 'key', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_6', 'attention', 'self', 'query', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_6', 'attention', 'self', 'query', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_6', 'attention', 'self', 'value', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_6', 'attention', 'self', 'value', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_6', 'intermediate', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_6', 'intermediate', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_6', 'output', 'LayerNorm', 'beta']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_6', 'output', 'LayerNorm', 'gamma']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_6', 'output', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_6', 'output', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_7', 'attention', 'output', 'LayerNorm', 'beta']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_7', 'attention', 'output', 'LayerNorm', 'gamma']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_7', 'attention', 'output', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_7', 'attention', 'output', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_7', 'attention', 'self', 'key', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_7', 'attention', 'self', 'key', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_7', 'attention', 'self', 'query', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_7', 'attention', 'self', 'query', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_7', 'attention', 'self', 'value', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_7', 'attention', 'self', 'value', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_7', 'intermediate', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_7', 'intermediate', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_7', 'output', 'LayerNorm', 'beta']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_7', 'output', 'LayerNorm', 'gamma']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_7', 'output', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_7', 'output', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_8', 'attention', 'output', 'LayerNorm', 'beta']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_8', 'attention', 'output', 'LayerNorm', 'gamma']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_8', 'attention', 'output', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_8', 'attention', 'output', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_8', 'attention', 'self', 'key', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_8', 'attention', 'self', 'key', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_8', 'attention', 'self', 'query', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_8', 'attention', 'self', 'query', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_8', 'attention', 'self', 'value', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_8', 'attention', 'self', 'value', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_8', 'intermediate', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_8', 'intermediate', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_8', 'output', 'LayerNorm', 'beta']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_8', 'output', 'LayerNorm', 'gamma']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_8', 'output', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_8', 'output', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_9', 'attention', 'output', 'LayerNorm', 'beta']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_9', 'attention', 'output', 'LayerNorm', 'gamma']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_9', 'attention', 'output', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_9', 'attention', 'output', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_9', 'attention', 'self', 'key', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_9', 'attention', 'self', 'key', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_9', 'attention', 'self', 'query', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_9', 'attention', 'self', 'query', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_9', 'attention', 'self', 'value', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_9', 'attention', 'self', 'value', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_9', 'intermediate', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_9', 'intermediate', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_9', 'output', 'LayerNorm', 'beta']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_9', 'output', 'LayerNorm', 'gamma']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_9', 'output', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'encoder', 'layer_9', 'output', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['bert', 'pooler', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['bert', 'pooler', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['cls', 'predictions', 'output_bias']\n",
      "Initialize PyTorch weight ['cls', 'predictions', 'transform', 'LayerNorm', 'beta']\n",
      "Initialize PyTorch weight ['cls', 'predictions', 'transform', 'LayerNorm', 'gamma']\n",
      "Initialize PyTorch weight ['cls', 'predictions', 'transform', 'dense', 'bias']\n",
      "Initialize PyTorch weight ['cls', 'predictions', 'transform', 'dense', 'kernel']\n",
      "Initialize PyTorch weight ['cls', 'seq_relationship', 'output_bias']\n",
      "Initialize PyTorch weight ['cls', 'seq_relationship', 'output_weights']\n",
      "Save PyTorch model to ../working/pytorch_model.bin\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'../working/bert_config.json'"
      ]
     },
     "execution_count": 57,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "convert_tf_checkpoint_to_pytorch.convert_tf_checkpoint_to_pytorch(\n",
    "    BERT_MODEL_PATH + 'bert_model.ckpt',\n",
    "BERT_MODEL_PATH + 'bert_config.json',\n",
    "WORK_DIR + 'pytorch_model.bin')\n",
    "\n",
    "shutil.copyfile(BERT_MODEL_PATH + 'bert_config.json', WORK_DIR + 'bert_config.json')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## BertModel_1 0.94135LB 0.94185CV"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [],
   "source": [
    "BERT_OUT = 128\n",
    "DENSE_HIDDEN_UNITS = 128"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [],
   "source": [
    "class NeuralNet(nn.Module):\n",
    "    def __init__(self):\n",
    "        super(NeuralNet, self).__init__()\n",
    "        #self.bert_layer = BertForSequenceClassification(bert_config,num_labels = BERT_OUT)\n",
    "        self.bert_layer = BertForSequenceClassification.from_pretrained(\"../working\",cache_dir=None,num_labels = BERT_OUT)\n",
    "        self.linear_out = nn.Linear(BERT_OUT, 1)\n",
    "        self.linear_aux_out = nn.Linear(BERT_OUT, 8)\n",
    "        #self.identity_out = nn.Linear(BERT_OUT, 9)\n",
    "        self.drop_out_layer = nn.Dropout(p=0.1)        \n",
    "        \n",
    "    def forward(self, x, attention_mask=None, labels=None):\n",
    "        bert_out = self.bert_layer(x, attention_mask=attention_mask, labels=labels)\n",
    "        drop_out_layer_out = self.drop_out_layer(bert_out)\n",
    "        result = self.linear_out(drop_out_layer_out)\n",
    "        aux_result = self.linear_aux_out(bert_out)\n",
    "        #identity_result = self.identity_out(bert_out)\n",
    "        out = torch.cat([result, aux_result], 1)\n",
    "        \n",
    "        return out"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DataParallel(\n",
       "  (module): NeuralNet(\n",
       "    (bert_layer): BertForSequenceClassification(\n",
       "      (bert): BertModel(\n",
       "        (embeddings): BertEmbeddings(\n",
       "          (word_embeddings): Embedding(30522, 768, padding_idx=0)\n",
       "          (position_embeddings): Embedding(512, 768)\n",
       "          (token_type_embeddings): Embedding(2, 768)\n",
       "          (LayerNorm): BertLayerNorm()\n",
       "          (dropout): Dropout(p=0.1)\n",
       "        )\n",
       "        (encoder): BertEncoder(\n",
       "          (layer): ModuleList(\n",
       "            (0): BertLayer(\n",
       "              (attention): BertAttention(\n",
       "                (self): BertSelfAttention(\n",
       "                  (query): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (key): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (value): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "                (output): BertSelfOutput(\n",
       "                  (dense): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (LayerNorm): BertLayerNorm()\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "              )\n",
       "              (intermediate): BertIntermediate(\n",
       "                (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
       "              )\n",
       "              (output): BertOutput(\n",
       "                (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
       "                (LayerNorm): BertLayerNorm()\n",
       "                (dropout): Dropout(p=0.1)\n",
       "              )\n",
       "            )\n",
       "            (1): BertLayer(\n",
       "              (attention): BertAttention(\n",
       "                (self): BertSelfAttention(\n",
       "                  (query): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (key): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (value): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "                (output): BertSelfOutput(\n",
       "                  (dense): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (LayerNorm): BertLayerNorm()\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "              )\n",
       "              (intermediate): BertIntermediate(\n",
       "                (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
       "              )\n",
       "              (output): BertOutput(\n",
       "                (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
       "                (LayerNorm): BertLayerNorm()\n",
       "                (dropout): Dropout(p=0.1)\n",
       "              )\n",
       "            )\n",
       "            (2): BertLayer(\n",
       "              (attention): BertAttention(\n",
       "                (self): BertSelfAttention(\n",
       "                  (query): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (key): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (value): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "                (output): BertSelfOutput(\n",
       "                  (dense): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (LayerNorm): BertLayerNorm()\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "              )\n",
       "              (intermediate): BertIntermediate(\n",
       "                (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
       "              )\n",
       "              (output): BertOutput(\n",
       "                (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
       "                (LayerNorm): BertLayerNorm()\n",
       "                (dropout): Dropout(p=0.1)\n",
       "              )\n",
       "            )\n",
       "            (3): BertLayer(\n",
       "              (attention): BertAttention(\n",
       "                (self): BertSelfAttention(\n",
       "                  (query): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (key): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (value): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "                (output): BertSelfOutput(\n",
       "                  (dense): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (LayerNorm): BertLayerNorm()\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "              )\n",
       "              (intermediate): BertIntermediate(\n",
       "                (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
       "              )\n",
       "              (output): BertOutput(\n",
       "                (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
       "                (LayerNorm): BertLayerNorm()\n",
       "                (dropout): Dropout(p=0.1)\n",
       "              )\n",
       "            )\n",
       "            (4): BertLayer(\n",
       "              (attention): BertAttention(\n",
       "                (self): BertSelfAttention(\n",
       "                  (query): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (key): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (value): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "                (output): BertSelfOutput(\n",
       "                  (dense): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (LayerNorm): BertLayerNorm()\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "              )\n",
       "              (intermediate): BertIntermediate(\n",
       "                (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
       "              )\n",
       "              (output): BertOutput(\n",
       "                (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
       "                (LayerNorm): BertLayerNorm()\n",
       "                (dropout): Dropout(p=0.1)\n",
       "              )\n",
       "            )\n",
       "            (5): BertLayer(\n",
       "              (attention): BertAttention(\n",
       "                (self): BertSelfAttention(\n",
       "                  (query): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (key): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (value): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "                (output): BertSelfOutput(\n",
       "                  (dense): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (LayerNorm): BertLayerNorm()\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "              )\n",
       "              (intermediate): BertIntermediate(\n",
       "                (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
       "              )\n",
       "              (output): BertOutput(\n",
       "                (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
       "                (LayerNorm): BertLayerNorm()\n",
       "                (dropout): Dropout(p=0.1)\n",
       "              )\n",
       "            )\n",
       "            (6): BertLayer(\n",
       "              (attention): BertAttention(\n",
       "                (self): BertSelfAttention(\n",
       "                  (query): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (key): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (value): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "                (output): BertSelfOutput(\n",
       "                  (dense): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (LayerNorm): BertLayerNorm()\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "              )\n",
       "              (intermediate): BertIntermediate(\n",
       "                (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
       "              )\n",
       "              (output): BertOutput(\n",
       "                (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
       "                (LayerNorm): BertLayerNorm()\n",
       "                (dropout): Dropout(p=0.1)\n",
       "              )\n",
       "            )\n",
       "            (7): BertLayer(\n",
       "              (attention): BertAttention(\n",
       "                (self): BertSelfAttention(\n",
       "                  (query): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (key): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (value): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "                (output): BertSelfOutput(\n",
       "                  (dense): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (LayerNorm): BertLayerNorm()\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "              )\n",
       "              (intermediate): BertIntermediate(\n",
       "                (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
       "              )\n",
       "              (output): BertOutput(\n",
       "                (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
       "                (LayerNorm): BertLayerNorm()\n",
       "                (dropout): Dropout(p=0.1)\n",
       "              )\n",
       "            )\n",
       "            (8): BertLayer(\n",
       "              (attention): BertAttention(\n",
       "                (self): BertSelfAttention(\n",
       "                  (query): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (key): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (value): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "                (output): BertSelfOutput(\n",
       "                  (dense): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (LayerNorm): BertLayerNorm()\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "              )\n",
       "              (intermediate): BertIntermediate(\n",
       "                (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
       "              )\n",
       "              (output): BertOutput(\n",
       "                (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
       "                (LayerNorm): BertLayerNorm()\n",
       "                (dropout): Dropout(p=0.1)\n",
       "              )\n",
       "            )\n",
       "            (9): BertLayer(\n",
       "              (attention): BertAttention(\n",
       "                (self): BertSelfAttention(\n",
       "                  (query): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (key): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (value): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "                (output): BertSelfOutput(\n",
       "                  (dense): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (LayerNorm): BertLayerNorm()\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "              )\n",
       "              (intermediate): BertIntermediate(\n",
       "                (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
       "              )\n",
       "              (output): BertOutput(\n",
       "                (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
       "                (LayerNorm): BertLayerNorm()\n",
       "                (dropout): Dropout(p=0.1)\n",
       "              )\n",
       "            )\n",
       "            (10): BertLayer(\n",
       "              (attention): BertAttention(\n",
       "                (self): BertSelfAttention(\n",
       "                  (query): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (key): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (value): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "                (output): BertSelfOutput(\n",
       "                  (dense): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (LayerNorm): BertLayerNorm()\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "              )\n",
       "              (intermediate): BertIntermediate(\n",
       "                (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
       "              )\n",
       "              (output): BertOutput(\n",
       "                (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
       "                (LayerNorm): BertLayerNorm()\n",
       "                (dropout): Dropout(p=0.1)\n",
       "              )\n",
       "            )\n",
       "            (11): BertLayer(\n",
       "              (attention): BertAttention(\n",
       "                (self): BertSelfAttention(\n",
       "                  (query): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (key): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (value): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "                (output): BertSelfOutput(\n",
       "                  (dense): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (LayerNorm): BertLayerNorm()\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "              )\n",
       "              (intermediate): BertIntermediate(\n",
       "                (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
       "              )\n",
       "              (output): BertOutput(\n",
       "                (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
       "                (LayerNorm): BertLayerNorm()\n",
       "                (dropout): Dropout(p=0.1)\n",
       "              )\n",
       "            )\n",
       "          )\n",
       "        )\n",
       "        (pooler): BertPooler(\n",
       "          (dense): Linear(in_features=768, out_features=768, bias=True)\n",
       "          (activation): Tanh()\n",
       "        )\n",
       "      )\n",
       "      (dropout): Dropout(p=0.1)\n",
       "      (classifier): Linear(in_features=768, out_features=128, bias=True)\n",
       "    )\n",
       "    (linear_out): Linear(in_features=128, out_features=1, bias=True)\n",
       "    (linear_aux_out): Linear(in_features=128, out_features=8, bias=True)\n",
       "    (drop_out_layer): Dropout(p=0.1)\n",
       "  )\n",
       ")"
      ]
     },
     "execution_count": 60,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model = NeuralNet()\n",
    "model = torch.nn.DataParallel(model)\n",
    "model.load_state_dict(torch.load(\"../input/prob-target333/batch128_probtarget2_1.bin\"))\n",
    "model.cuda()\n",
    "for param in model.parameters():\n",
    "    param.requires_grad = False\n",
    "model.eval()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_collator = TestCollator(0,1)\n",
    "seq_length_torch = torch.tensor(seq_length,dtype=torch.long)\n",
    "X_test_torch =torch.tensor(X_test, dtype=torch.long)\n",
    "test = torch.utils.data.TensorDataset(X_test_torch,seq_length_torch)\n",
    "test_loader = torch.utils.data.DataLoader(test, batch_size=32, shuffle=False,collate_fn=test_collator)\n",
    "test_preds = np.zeros((len(X_test)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "3201ee947a354f468f703c8afcb8fb5a",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(IntProgress(value=0, max=3042), HTML(value='')))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "tk0 = tqdm_notebook(test_loader)\n",
    "for i, x_batch in enumerate(tk0):\n",
    "    pred = model(x_batch.to(device), attention_mask=(x_batch > 0).to(device), labels=None)\n",
    "    test_preds[i * 32:(i + 1) * 32] = pred[:, 0].detach().cpu().squeeze().numpy()\n",
    "\n",
    "test_pred = torch.sigmoid(torch.tensor(test_preds)).numpy().ravel()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [],
   "source": [
    "submission_bert_1 = pd.DataFrame.from_dict({\n",
    "    'id': test_df['id'],\n",
    "    'prediction': test_pred\n",
    "})\n",
    "#submission_bert_1.to_csv('submission.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>prediction</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>7097320</td>\n",
       "      <td>0.000094</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>7097321</td>\n",
       "      <td>0.007118</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>7097322</td>\n",
       "      <td>0.070092</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>7097323</td>\n",
       "      <td>0.000373</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>7097324</td>\n",
       "      <td>0.000112</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        id  prediction\n",
       "0  7097320    0.000094\n",
       "1  7097321    0.007118\n",
       "2  7097322    0.070092\n",
       "3  7097323    0.000373\n",
       "4  7097324    0.000112"
      ]
     },
     "execution_count": 64,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "submission_bert_1.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## BertModel_2 0.94154LB 0.94184CV"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [],
   "source": [
    "BERT_OUT = 128\n",
    "DENSE_HIDDEN_UNITS = 128"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [],
   "source": [
    "class NeuralNet(nn.Module):\n",
    "    def __init__(self):\n",
    "        super(NeuralNet, self).__init__()\n",
    "        #self.bert_layer = BertForSequenceClassification(config=bert_config,num_labels = BERT_OUT)\n",
    "        self.bert_layer = BertForSequenceClassification.from_pretrained(\"../working\",cache_dir=None,num_labels = BERT_OUT)\n",
    "        self.linear_out = nn.Linear(BERT_OUT, 1)\n",
    "        self.linear_aux_out = nn.Linear(BERT_OUT, 6)\n",
    "        self.drop_out_layer = nn.Dropout(p=0.1)        \n",
    "        \n",
    "    def forward(self, x, attention_mask=None, labels=None):\n",
    "        bert_out = self.bert_layer(x, attention_mask=attention_mask, labels=labels)\n",
    "        drop_out_layer_out = self.drop_out_layer(bert_out)\n",
    "        result = self.linear_out(drop_out_layer_out)\n",
    "        aux_result = self.linear_aux_out(bert_out)\n",
    "        out = torch.cat([result, aux_result], 1)\n",
    "        \n",
    "        return out"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DataParallel(\n",
       "  (module): NeuralNet(\n",
       "    (bert_layer): BertForSequenceClassification(\n",
       "      (bert): BertModel(\n",
       "        (embeddings): BertEmbeddings(\n",
       "          (word_embeddings): Embedding(30522, 768, padding_idx=0)\n",
       "          (position_embeddings): Embedding(512, 768)\n",
       "          (token_type_embeddings): Embedding(2, 768)\n",
       "          (LayerNorm): BertLayerNorm()\n",
       "          (dropout): Dropout(p=0.1)\n",
       "        )\n",
       "        (encoder): BertEncoder(\n",
       "          (layer): ModuleList(\n",
       "            (0): BertLayer(\n",
       "              (attention): BertAttention(\n",
       "                (self): BertSelfAttention(\n",
       "                  (query): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (key): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (value): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "                (output): BertSelfOutput(\n",
       "                  (dense): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (LayerNorm): BertLayerNorm()\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "              )\n",
       "              (intermediate): BertIntermediate(\n",
       "                (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
       "              )\n",
       "              (output): BertOutput(\n",
       "                (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
       "                (LayerNorm): BertLayerNorm()\n",
       "                (dropout): Dropout(p=0.1)\n",
       "              )\n",
       "            )\n",
       "            (1): BertLayer(\n",
       "              (attention): BertAttention(\n",
       "                (self): BertSelfAttention(\n",
       "                  (query): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (key): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (value): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "                (output): BertSelfOutput(\n",
       "                  (dense): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (LayerNorm): BertLayerNorm()\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "              )\n",
       "              (intermediate): BertIntermediate(\n",
       "                (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
       "              )\n",
       "              (output): BertOutput(\n",
       "                (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
       "                (LayerNorm): BertLayerNorm()\n",
       "                (dropout): Dropout(p=0.1)\n",
       "              )\n",
       "            )\n",
       "            (2): BertLayer(\n",
       "              (attention): BertAttention(\n",
       "                (self): BertSelfAttention(\n",
       "                  (query): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (key): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (value): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "                (output): BertSelfOutput(\n",
       "                  (dense): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (LayerNorm): BertLayerNorm()\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "              )\n",
       "              (intermediate): BertIntermediate(\n",
       "                (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
       "              )\n",
       "              (output): BertOutput(\n",
       "                (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
       "                (LayerNorm): BertLayerNorm()\n",
       "                (dropout): Dropout(p=0.1)\n",
       "              )\n",
       "            )\n",
       "            (3): BertLayer(\n",
       "              (attention): BertAttention(\n",
       "                (self): BertSelfAttention(\n",
       "                  (query): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (key): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (value): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "                (output): BertSelfOutput(\n",
       "                  (dense): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (LayerNorm): BertLayerNorm()\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "              )\n",
       "              (intermediate): BertIntermediate(\n",
       "                (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
       "              )\n",
       "              (output): BertOutput(\n",
       "                (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
       "                (LayerNorm): BertLayerNorm()\n",
       "                (dropout): Dropout(p=0.1)\n",
       "              )\n",
       "            )\n",
       "            (4): BertLayer(\n",
       "              (attention): BertAttention(\n",
       "                (self): BertSelfAttention(\n",
       "                  (query): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (key): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (value): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "                (output): BertSelfOutput(\n",
       "                  (dense): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (LayerNorm): BertLayerNorm()\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "              )\n",
       "              (intermediate): BertIntermediate(\n",
       "                (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
       "              )\n",
       "              (output): BertOutput(\n",
       "                (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
       "                (LayerNorm): BertLayerNorm()\n",
       "                (dropout): Dropout(p=0.1)\n",
       "              )\n",
       "            )\n",
       "            (5): BertLayer(\n",
       "              (attention): BertAttention(\n",
       "                (self): BertSelfAttention(\n",
       "                  (query): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (key): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (value): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "                (output): BertSelfOutput(\n",
       "                  (dense): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (LayerNorm): BertLayerNorm()\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "              )\n",
       "              (intermediate): BertIntermediate(\n",
       "                (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
       "              )\n",
       "              (output): BertOutput(\n",
       "                (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
       "                (LayerNorm): BertLayerNorm()\n",
       "                (dropout): Dropout(p=0.1)\n",
       "              )\n",
       "            )\n",
       "            (6): BertLayer(\n",
       "              (attention): BertAttention(\n",
       "                (self): BertSelfAttention(\n",
       "                  (query): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (key): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (value): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "                (output): BertSelfOutput(\n",
       "                  (dense): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (LayerNorm): BertLayerNorm()\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "              )\n",
       "              (intermediate): BertIntermediate(\n",
       "                (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
       "              )\n",
       "              (output): BertOutput(\n",
       "                (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
       "                (LayerNorm): BertLayerNorm()\n",
       "                (dropout): Dropout(p=0.1)\n",
       "              )\n",
       "            )\n",
       "            (7): BertLayer(\n",
       "              (attention): BertAttention(\n",
       "                (self): BertSelfAttention(\n",
       "                  (query): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (key): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (value): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "                (output): BertSelfOutput(\n",
       "                  (dense): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (LayerNorm): BertLayerNorm()\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "              )\n",
       "              (intermediate): BertIntermediate(\n",
       "                (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
       "              )\n",
       "              (output): BertOutput(\n",
       "                (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
       "                (LayerNorm): BertLayerNorm()\n",
       "                (dropout): Dropout(p=0.1)\n",
       "              )\n",
       "            )\n",
       "            (8): BertLayer(\n",
       "              (attention): BertAttention(\n",
       "                (self): BertSelfAttention(\n",
       "                  (query): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (key): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (value): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "                (output): BertSelfOutput(\n",
       "                  (dense): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (LayerNorm): BertLayerNorm()\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "              )\n",
       "              (intermediate): BertIntermediate(\n",
       "                (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
       "              )\n",
       "              (output): BertOutput(\n",
       "                (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
       "                (LayerNorm): BertLayerNorm()\n",
       "                (dropout): Dropout(p=0.1)\n",
       "              )\n",
       "            )\n",
       "            (9): BertLayer(\n",
       "              (attention): BertAttention(\n",
       "                (self): BertSelfAttention(\n",
       "                  (query): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (key): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (value): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "                (output): BertSelfOutput(\n",
       "                  (dense): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (LayerNorm): BertLayerNorm()\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "              )\n",
       "              (intermediate): BertIntermediate(\n",
       "                (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
       "              )\n",
       "              (output): BertOutput(\n",
       "                (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
       "                (LayerNorm): BertLayerNorm()\n",
       "                (dropout): Dropout(p=0.1)\n",
       "              )\n",
       "            )\n",
       "            (10): BertLayer(\n",
       "              (attention): BertAttention(\n",
       "                (self): BertSelfAttention(\n",
       "                  (query): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (key): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (value): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "                (output): BertSelfOutput(\n",
       "                  (dense): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (LayerNorm): BertLayerNorm()\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "              )\n",
       "              (intermediate): BertIntermediate(\n",
       "                (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
       "              )\n",
       "              (output): BertOutput(\n",
       "                (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
       "                (LayerNorm): BertLayerNorm()\n",
       "                (dropout): Dropout(p=0.1)\n",
       "              )\n",
       "            )\n",
       "            (11): BertLayer(\n",
       "              (attention): BertAttention(\n",
       "                (self): BertSelfAttention(\n",
       "                  (query): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (key): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (value): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "                (output): BertSelfOutput(\n",
       "                  (dense): Linear(in_features=768, out_features=768, bias=True)\n",
       "                  (LayerNorm): BertLayerNorm()\n",
       "                  (dropout): Dropout(p=0.1)\n",
       "                )\n",
       "              )\n",
       "              (intermediate): BertIntermediate(\n",
       "                (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
       "              )\n",
       "              (output): BertOutput(\n",
       "                (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
       "                (LayerNorm): BertLayerNorm()\n",
       "                (dropout): Dropout(p=0.1)\n",
       "              )\n",
       "            )\n",
       "          )\n",
       "        )\n",
       "        (pooler): BertPooler(\n",
       "          (dense): Linear(in_features=768, out_features=768, bias=True)\n",
       "          (activation): Tanh()\n",
       "        )\n",
       "      )\n",
       "      (dropout): Dropout(p=0.1)\n",
       "      (classifier): Linear(in_features=768, out_features=128, bias=True)\n",
       "    )\n",
       "    (linear_out): Linear(in_features=128, out_features=1, bias=True)\n",
       "    (linear_aux_out): Linear(in_features=128, out_features=6, bias=True)\n",
       "    (drop_out_layer): Dropout(p=0.1)\n",
       "  )\n",
       ")"
      ]
     },
     "execution_count": 67,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model = NeuralNet()\n",
    "model = torch.nn.DataParallel(model)\n",
    "model.load_state_dict(torch.load(\"../input/dense-128-260seq-reweightd-dropout1/dense_128_260seq_reweightd_dropout1_1.bin\"))\n",
    "model.cuda()\n",
    "for param in model.parameters():\n",
    "    param.requires_grad = False\n",
    "model.eval()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_collator = TestCollator(0,1)\n",
    "seq_length_torch = torch.tensor(seq_length,dtype=torch.long)\n",
    "X_test_torch =torch.tensor(X_test, dtype=torch.long)\n",
    "test = torch.utils.data.TensorDataset(X_test_torch,seq_length_torch)\n",
    "test_loader = torch.utils.data.DataLoader(test, batch_size=32, shuffle=False,collate_fn=test_collator)\n",
    "test_preds = np.zeros((len(X_test)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "a36e36a31825410eb3e06d1412af3edf",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(IntProgress(value=0, max=3042), HTML(value='')))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "tk0 = tqdm_notebook(test_loader)\n",
    "for i, x_batch in enumerate(tk0):\n",
    "    pred = model(x_batch.to(device), attention_mask=(x_batch > 0).to(device), labels=None)\n",
    "    test_preds[i * 32:(i + 1) * 32] = pred[:, 0].detach().cpu().squeeze().numpy()\n",
    "\n",
    "test_pred = torch.sigmoid(torch.tensor(test_preds)).numpy().ravel()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>prediction</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>7097320</td>\n",
       "      <td>0.000146</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>7097321</td>\n",
       "      <td>0.002046</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>7097322</td>\n",
       "      <td>0.164108</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>7097323</td>\n",
       "      <td>0.000535</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>7097324</td>\n",
       "      <td>0.000175</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        id  prediction\n",
       "0  7097320    0.000146\n",
       "1  7097321    0.002046\n",
       "2  7097322    0.164108\n",
       "3  7097323    0.000535\n",
       "4  7097324    0.000175"
      ]
     },
     "execution_count": 70,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "submission_bert_2 = pd.DataFrame.from_dict({\n",
    "    'id': test_df['id'],\n",
    "    'prediction': test_pred\n",
    "})\n",
    "#submission_bert_2.to_csv('submission.csv', index=False)\n",
    "submission_bert_2.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Blending"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [],
   "source": [
    "pred = \\\n",
    "0.130*submission_lstm.prediction+\\\n",
    "0.350*submission_bert_large.prediction +\\\n",
    "0.175*submission_bert_1.prediction + 0.175*submission_bert_2.prediction + \\\n",
    "0.170*submission_gpt2_1.prediction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [],
   "source": [
    "submission_blending = pd.DataFrame({\n",
    "    \"id\":submission_bert_1.id,\n",
    "    \"prediction\":pred\n",
    "})\n",
    "submission_blending.to_csv(\"submission.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>prediction</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>7097320</td>\n",
       "      <td>0.000195</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>7097321</td>\n",
       "      <td>0.010158</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>7097322</td>\n",
       "      <td>0.112465</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>7097323</td>\n",
       "      <td>0.002406</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>7097324</td>\n",
       "      <td>0.000555</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>7097325</td>\n",
       "      <td>0.000076</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>7097326</td>\n",
       "      <td>0.995423</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>7097327</td>\n",
       "      <td>0.327779</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>7097328</td>\n",
       "      <td>0.000142</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>7097329</td>\n",
       "      <td>0.002315</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>7097330</td>\n",
       "      <td>0.001165</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>7097331</td>\n",
       "      <td>0.228012</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>7097332</td>\n",
       "      <td>0.023661</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>7097333</td>\n",
       "      <td>0.001755</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>7097334</td>\n",
       "      <td>0.110021</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>7097335</td>\n",
       "      <td>0.001391</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>7097336</td>\n",
       "      <td>0.001610</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>7097337</td>\n",
       "      <td>0.000084</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>7097338</td>\n",
       "      <td>0.000144</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>7097339</td>\n",
       "      <td>0.878871</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>7097340</td>\n",
       "      <td>0.000798</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>7097341</td>\n",
       "      <td>0.053250</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>7097342</td>\n",
       "      <td>0.000564</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>7097343</td>\n",
       "      <td>0.000088</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>7097344</td>\n",
       "      <td>0.001508</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         id  prediction\n",
       "0   7097320    0.000195\n",
       "1   7097321    0.010158\n",
       "2   7097322    0.112465\n",
       "3   7097323    0.002406\n",
       "4   7097324    0.000555\n",
       "5   7097325    0.000076\n",
       "6   7097326    0.995423\n",
       "7   7097327    0.327779\n",
       "8   7097328    0.000142\n",
       "9   7097329    0.002315\n",
       "10  7097330    0.001165\n",
       "11  7097331    0.228012\n",
       "12  7097332    0.023661\n",
       "13  7097333    0.001755\n",
       "14  7097334    0.110021\n",
       "15  7097335    0.001391\n",
       "16  7097336    0.001610\n",
       "17  7097337    0.000084\n",
       "18  7097338    0.000144\n",
       "19  7097339    0.878871\n",
       "20  7097340    0.000798\n",
       "21  7097341    0.053250\n",
       "22  7097342    0.000564\n",
       "23  7097343    0.000088\n",
       "24  7097344    0.001508"
      ]
     },
     "execution_count": 73,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "submission_blending.head(25)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  },
  "widgets": {
   "application/vnd.jupyter.widget-state+json": {
    "state": {
     "1717d736715b45a6baa3564ba01e0d39": {
      "model_module": "@jupyter-widgets/base",
      "model_module_version": "1.1.0",
      "model_name": "LayoutModel",
      "state": {
       "_model_module": "@jupyter-widgets/base",
       "_model_module_version": "1.1.0",
       "_model_name": "LayoutModel",
       "_view_count": null,
       "_view_module": "@jupyter-widgets/base",
       "_view_module_version": "1.1.0",
       "_view_name": "LayoutView",
       "align_content": null,
       "align_items": null,
       "align_self": null,
       "border": null,
       "bottom": null,
       "display": null,
       "flex": null,
       "flex_flow": null,
       "grid_area": null,
       "grid_auto_columns": null,
       "grid_auto_flow": null,
       "grid_auto_rows": null,
       "grid_column": null,
       "grid_gap": null,
       "grid_row": null,
       "grid_template_areas": null,
       "grid_template_columns": null,
       "grid_template_rows": null,
       "height": null,
       "justify_content": null,
       "left": null,
       "margin": null,
       "max_height": null,
       "max_width": null,
       "min_height": null,
       "min_width": null,
       "order": null,
       "overflow": null,
       "overflow_x": null,
       "overflow_y": null,
       "padding": null,
       "right": null,
       "top": null,
       "visibility": null,
       "width": null
      }
     },
     "18ff373b829549219a1ed6b67de2b088": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.4.0",
      "model_name": "HTMLModel",
      "state": {
       "_dom_classes": [],
       "_model_module": "@jupyter-widgets/controls",
       "_model_module_version": "1.4.0",
       "_model_name": "HTMLModel",
       "_view_count": null,
       "_view_module": "@jupyter-widgets/controls",
       "_view_module_version": "1.4.0",
       "_view_name": "HTMLView",
       "description": "",
       "description_tooltip": null,
       "layout": "IPY_MODEL_35244236b7b743c58586f3da7ac702ff",
       "placeholder": "​",
       "style": "IPY_MODEL_f5dd1f13b25b48f9aa1b665cc88e0b02",
       "value": "100% 97320/97320 [00:33&lt;00:00, 2921.84it/s]"
      }
     },
     "192bb5e0b5294499940cf4cfc91401b5": {
      "model_module": "@jupyter-widgets/base",
      "model_module_version": "1.1.0",
      "model_name": "LayoutModel",
      "state": {
       "_model_module": "@jupyter-widgets/base",
       "_model_module_version": "1.1.0",
       "_model_name": "LayoutModel",
       "_view_count": null,
       "_view_module": "@jupyter-widgets/base",
       "_view_module_version": "1.1.0",
       "_view_name": "LayoutView",
       "align_content": null,
       "align_items": null,
       "align_self": null,
       "border": null,
       "bottom": null,
       "display": null,
       "flex": null,
       "flex_flow": null,
       "grid_area": null,
       "grid_auto_columns": null,
       "grid_auto_flow": null,
       "grid_auto_rows": null,
       "grid_column": null,
       "grid_gap": null,
       "grid_row": null,
       "grid_template_areas": null,
       "grid_template_columns": null,
       "grid_template_rows": null,
       "height": null,
       "justify_content": null,
       "left": null,
       "margin": null,
       "max_height": null,
       "max_width": null,
       "min_height": null,
       "min_width": null,
       "order": null,
       "overflow": null,
       "overflow_x": null,
       "overflow_y": null,
       "padding": null,
       "right": null,
       "top": null,
       "visibility": null,
       "width": null
      }
     },
     "19962dce3706461ea2d287a15bdacb59": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.4.0",
      "model_name": "DescriptionStyleModel",
      "state": {
       "_model_module": "@jupyter-widgets/controls",
       "_model_module_version": "1.4.0",
       "_model_name": "DescriptionStyleModel",
       "_view_count": null,
       "_view_module": "@jupyter-widgets/base",
       "_view_module_version": "1.1.0",
       "_view_name": "StyleView",
       "description_width": ""
      }
     },
     "19bef29abec24c63879d4ea1caeab8d5": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.4.0",
      "model_name": "HBoxModel",
      "state": {
       "_dom_classes": [],
       "_model_module": "@jupyter-widgets/controls",
       "_model_module_version": "1.4.0",
       "_model_name": "HBoxModel",
       "_view_count": null,
       "_view_module": "@jupyter-widgets/controls",
       "_view_module_version": "1.4.0",
       "_view_name": "HBoxView",
       "box_style": "",
       "children": [
        "IPY_MODEL_49f721344a004f76a64a642dd5c59f86",
        "IPY_MODEL_18ff373b829549219a1ed6b67de2b088"
       ],
       "layout": "IPY_MODEL_46bbad7fb15d4597adab182b2ce9e45b"
      }
     },
     "1e08a3288638431099f7c69890b23487": {
      "model_module": "@jupyter-widgets/base",
      "model_module_version": "1.1.0",
      "model_name": "LayoutModel",
      "state": {
       "_model_module": "@jupyter-widgets/base",
       "_model_module_version": "1.1.0",
       "_model_name": "LayoutModel",
       "_view_count": null,
       "_view_module": "@jupyter-widgets/base",
       "_view_module_version": "1.1.0",
       "_view_name": "LayoutView",
       "align_content": null,
       "align_items": null,
       "align_self": null,
       "border": null,
       "bottom": null,
       "display": null,
       "flex": null,
       "flex_flow": null,
       "grid_area": null,
       "grid_auto_columns": null,
       "grid_auto_flow": null,
       "grid_auto_rows": null,
       "grid_column": null,
       "grid_gap": null,
       "grid_row": null,
       "grid_template_areas": null,
       "grid_template_columns": null,
       "grid_template_rows": null,
       "height": null,
       "justify_content": null,
       "left": null,
       "margin": null,
       "max_height": null,
       "max_width": null,
       "min_height": null,
       "min_width": null,
       "order": null,
       "overflow": null,
       "overflow_x": null,
       "overflow_y": null,
       "padding": null,
       "right": null,
       "top": null,
       "visibility": null,
       "width": null
      }
     },
     "2be6e19730864badbf652f1456572bca": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.4.0",
      "model_name": "IntProgressModel",
      "state": {
       "_dom_classes": [],
       "_model_module": "@jupyter-widgets/controls",
       "_model_module_version": "1.4.0",
       "_model_name": "IntProgressModel",
       "_view_count": null,
       "_view_module": "@jupyter-widgets/controls",
       "_view_module_version": "1.4.0",
       "_view_name": "ProgressView",
       "bar_style": "success",
       "description": "",
       "description_tooltip": null,
       "layout": "IPY_MODEL_6bd890786670406ca88601df1de63977",
       "max": 3042,
       "min": 0,
       "orientation": "horizontal",
       "style": "IPY_MODEL_bf648f80cd134750a390d51fb03e09b2",
       "value": 3042
      }
     },
     "2fd496c26bae49f7bea3d946a3806361": {
      "model_module": "@jupyter-widgets/base",
      "model_module_version": "1.1.0",
      "model_name": "LayoutModel",
      "state": {
       "_model_module": "@jupyter-widgets/base",
       "_model_module_version": "1.1.0",
       "_model_name": "LayoutModel",
       "_view_count": null,
       "_view_module": "@jupyter-widgets/base",
       "_view_module_version": "1.1.0",
       "_view_name": "LayoutView",
       "align_content": null,
       "align_items": null,
       "align_self": null,
       "border": null,
       "bottom": null,
       "display": null,
       "flex": null,
       "flex_flow": null,
       "grid_area": null,
       "grid_auto_columns": null,
       "grid_auto_flow": null,
       "grid_auto_rows": null,
       "grid_column": null,
       "grid_gap": null,
       "grid_row": null,
       "grid_template_areas": null,
       "grid_template_columns": null,
       "grid_template_rows": null,
       "height": null,
       "justify_content": null,
       "left": null,
       "margin": null,
       "max_height": null,
       "max_width": null,
       "min_height": null,
       "min_width": null,
       "order": null,
       "overflow": null,
       "overflow_x": null,
       "overflow_y": null,
       "padding": null,
       "right": null,
       "top": null,
       "visibility": null,
       "width": null
      }
     },
     "318f19a1e9994e2ba072628c226f8335": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.4.0",
      "model_name": "IntProgressModel",
      "state": {
       "_dom_classes": [],
       "_model_module": "@jupyter-widgets/controls",
       "_model_module_version": "1.4.0",
       "_model_name": "IntProgressModel",
       "_view_count": null,
       "_view_module": "@jupyter-widgets/controls",
       "_view_module_version": "1.4.0",
       "_view_name": "ProgressView",
       "bar_style": "success",
       "description": "",
       "description_tooltip": null,
       "layout": "IPY_MODEL_ae78d6a90f3a40a698420fc0bce7ce34",
       "max": 3042,
       "min": 0,
       "orientation": "horizontal",
       "style": "IPY_MODEL_9ffda53c1cf147eeb1cac00245068bb1",
       "value": 3042
      }
     },
     "3201ee947a354f468f703c8afcb8fb5a": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.4.0",
      "model_name": "HBoxModel",
      "state": {
       "_dom_classes": [],
       "_model_module": "@jupyter-widgets/controls",
       "_model_module_version": "1.4.0",
       "_model_name": "HBoxModel",
       "_view_count": null,
       "_view_module": "@jupyter-widgets/controls",
       "_view_module_version": "1.4.0",
       "_view_name": "HBoxView",
       "box_style": "",
       "children": [
        "IPY_MODEL_4571b10c26aa4aee8a91d5182b02efbe",
        "IPY_MODEL_797bc43b4fce4b8b9182c9d51d0d0091"
       ],
       "layout": "IPY_MODEL_a800a3732b964a8796c3da36676f13cb"
      }
     },
     "35244236b7b743c58586f3da7ac702ff": {
      "model_module": "@jupyter-widgets/base",
      "model_module_version": "1.1.0",
      "model_name": "LayoutModel",
      "state": {
       "_model_module": "@jupyter-widgets/base",
       "_model_module_version": "1.1.0",
       "_model_name": "LayoutModel",
       "_view_count": null,
       "_view_module": "@jupyter-widgets/base",
       "_view_module_version": "1.1.0",
       "_view_name": "LayoutView",
       "align_content": null,
       "align_items": null,
       "align_self": null,
       "border": null,
       "bottom": null,
       "display": null,
       "flex": null,
       "flex_flow": null,
       "grid_area": null,
       "grid_auto_columns": null,
       "grid_auto_flow": null,
       "grid_auto_rows": null,
       "grid_column": null,
       "grid_gap": null,
       "grid_row": null,
       "grid_template_areas": null,
       "grid_template_columns": null,
       "grid_template_rows": null,
       "height": null,
       "justify_content": null,
       "left": null,
       "margin": null,
       "max_height": null,
       "max_width": null,
       "min_height": null,
       "min_width": null,
       "order": null,
       "overflow": null,
       "overflow_x": null,
       "overflow_y": null,
       "padding": null,
       "right": null,
       "top": null,
       "visibility": null,
       "width": null
      }
     },
     "35775a4d0fac44e5a12cefd9df81ab43": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.4.0",
      "model_name": "DescriptionStyleModel",
      "state": {
       "_model_module": "@jupyter-widgets/controls",
       "_model_module_version": "1.4.0",
       "_model_name": "DescriptionStyleModel",
       "_view_count": null,
       "_view_module": "@jupyter-widgets/base",
       "_view_module_version": "1.1.0",
       "_view_name": "StyleView",
       "description_width": ""
      }
     },
     "425d3b8dcddb46e080e128abb5237901": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.4.0",
      "model_name": "IntProgressModel",
      "state": {
       "_dom_classes": [],
       "_model_module": "@jupyter-widgets/controls",
       "_model_module_version": "1.4.0",
       "_model_name": "IntProgressModel",
       "_view_count": null,
       "_view_module": "@jupyter-widgets/controls",
       "_view_module_version": "1.4.0",
       "_view_name": "ProgressView",
       "bar_style": "success",
       "description": "",
       "description_tooltip": null,
       "layout": "IPY_MODEL_f89b683b58674f62a5e8affd1aa06616",
       "max": 97320,
       "min": 0,
       "orientation": "horizontal",
       "style": "IPY_MODEL_e8c6dfad256b450ebcba081db377f680",
       "value": 97320
      }
     },
     "43dea50dc145427db9841f0346a0f082": {
      "model_module": "@jupyter-widgets/base",
      "model_module_version": "1.1.0",
      "model_name": "LayoutModel",
      "state": {
       "_model_module": "@jupyter-widgets/base",
       "_model_module_version": "1.1.0",
       "_model_name": "LayoutModel",
       "_view_count": null,
       "_view_module": "@jupyter-widgets/base",
       "_view_module_version": "1.1.0",
       "_view_name": "LayoutView",
       "align_content": null,
       "align_items": null,
       "align_self": null,
       "border": null,
       "bottom": null,
       "display": null,
       "flex": null,
       "flex_flow": null,
       "grid_area": null,
       "grid_auto_columns": null,
       "grid_auto_flow": null,
       "grid_auto_rows": null,
       "grid_column": null,
       "grid_gap": null,
       "grid_row": null,
       "grid_template_areas": null,
       "grid_template_columns": null,
       "grid_template_rows": null,
       "height": null,
       "justify_content": null,
       "left": null,
       "margin": null,
       "max_height": null,
       "max_width": null,
       "min_height": null,
       "min_width": null,
       "order": null,
       "overflow": null,
       "overflow_x": null,
       "overflow_y": null,
       "padding": null,
       "right": null,
       "top": null,
       "visibility": null,
       "width": null
      }
     },
     "4571b10c26aa4aee8a91d5182b02efbe": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.4.0",
      "model_name": "IntProgressModel",
      "state": {
       "_dom_classes": [],
       "_model_module": "@jupyter-widgets/controls",
       "_model_module_version": "1.4.0",
       "_model_name": "IntProgressModel",
       "_view_count": null,
       "_view_module": "@jupyter-widgets/controls",
       "_view_module_version": "1.4.0",
       "_view_name": "ProgressView",
       "bar_style": "success",
       "description": "",
       "description_tooltip": null,
       "layout": "IPY_MODEL_f0d5c58dcee7472898dc4fd27e049871",
       "max": 3042,
       "min": 0,
       "orientation": "horizontal",
       "style": "IPY_MODEL_8f554b50b3d64771923f39a38a094742",
       "value": 3042
      }
     },
     "46bbad7fb15d4597adab182b2ce9e45b": {
      "model_module": "@jupyter-widgets/base",
      "model_module_version": "1.1.0",
      "model_name": "LayoutModel",
      "state": {
       "_model_module": "@jupyter-widgets/base",
       "_model_module_version": "1.1.0",
       "_model_name": "LayoutModel",
       "_view_count": null,
       "_view_module": "@jupyter-widgets/base",
       "_view_module_version": "1.1.0",
       "_view_name": "LayoutView",
       "align_content": null,
       "align_items": null,
       "align_self": null,
       "border": null,
       "bottom": null,
       "display": null,
       "flex": null,
       "flex_flow": null,
       "grid_area": null,
       "grid_auto_columns": null,
       "grid_auto_flow": null,
       "grid_auto_rows": null,
       "grid_column": null,
       "grid_gap": null,
       "grid_row": null,
       "grid_template_areas": null,
       "grid_template_columns": null,
       "grid_template_rows": null,
       "height": null,
       "justify_content": null,
       "left": null,
       "margin": null,
       "max_height": null,
       "max_width": null,
       "min_height": null,
       "min_width": null,
       "order": null,
       "overflow": null,
       "overflow_x": null,
       "overflow_y": null,
       "padding": null,
       "right": null,
       "top": null,
       "visibility": null,
       "width": null
      }
     },
     "49f721344a004f76a64a642dd5c59f86": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.4.0",
      "model_name": "IntProgressModel",
      "state": {
       "_dom_classes": [],
       "_model_module": "@jupyter-widgets/controls",
       "_model_module_version": "1.4.0",
       "_model_name": "IntProgressModel",
       "_view_count": null,
       "_view_module": "@jupyter-widgets/controls",
       "_view_module_version": "1.4.0",
       "_view_name": "ProgressView",
       "bar_style": "success",
       "description": "",
       "description_tooltip": null,
       "layout": "IPY_MODEL_1e08a3288638431099f7c69890b23487",
       "max": 97320,
       "min": 0,
       "orientation": "horizontal",
       "style": "IPY_MODEL_96d3cdbda5ba4272b2d0d7442181d0dc",
       "value": 97320
      }
     },
     "5711aa9fec30456e9c2f6d462f5fb782": {
      "model_module": "@jupyter-widgets/base",
      "model_module_version": "1.1.0",
      "model_name": "LayoutModel",
      "state": {
       "_model_module": "@jupyter-widgets/base",
       "_model_module_version": "1.1.0",
       "_model_name": "LayoutModel",
       "_view_count": null,
       "_view_module": "@jupyter-widgets/base",
       "_view_module_version": "1.1.0",
       "_view_name": "LayoutView",
       "align_content": null,
       "align_items": null,
       "align_self": null,
       "border": null,
       "bottom": null,
       "display": null,
       "flex": null,
       "flex_flow": null,
       "grid_area": null,
       "grid_auto_columns": null,
       "grid_auto_flow": null,
       "grid_auto_rows": null,
       "grid_column": null,
       "grid_gap": null,
       "grid_row": null,
       "grid_template_areas": null,
       "grid_template_columns": null,
       "grid_template_rows": null,
       "height": null,
       "justify_content": null,
       "left": null,
       "margin": null,
       "max_height": null,
       "max_width": null,
       "min_height": null,
       "min_width": null,
       "order": null,
       "overflow": null,
       "overflow_x": null,
       "overflow_y": null,
       "padding": null,
       "right": null,
       "top": null,
       "visibility": null,
       "width": null
      }
     },
     "695eabd218ef4dac8a74771a3a9a91b9": {
      "model_module": "@jupyter-widgets/base",
      "model_module_version": "1.1.0",
      "model_name": "LayoutModel",
      "state": {
       "_model_module": "@jupyter-widgets/base",
       "_model_module_version": "1.1.0",
       "_model_name": "LayoutModel",
       "_view_count": null,
       "_view_module": "@jupyter-widgets/base",
       "_view_module_version": "1.1.0",
       "_view_name": "LayoutView",
       "align_content": null,
       "align_items": null,
       "align_self": null,
       "border": null,
       "bottom": null,
       "display": null,
       "flex": null,
       "flex_flow": null,
       "grid_area": null,
       "grid_auto_columns": null,
       "grid_auto_flow": null,
       "grid_auto_rows": null,
       "grid_column": null,
       "grid_gap": null,
       "grid_row": null,
       "grid_template_areas": null,
       "grid_template_columns": null,
       "grid_template_rows": null,
       "height": null,
       "justify_content": null,
       "left": null,
       "margin": null,
       "max_height": null,
       "max_width": null,
       "min_height": null,
       "min_width": null,
       "order": null,
       "overflow": null,
       "overflow_x": null,
       "overflow_y": null,
       "padding": null,
       "right": null,
       "top": null,
       "visibility": null,
       "width": null
      }
     },
     "6bd890786670406ca88601df1de63977": {
      "model_module": "@jupyter-widgets/base",
      "model_module_version": "1.1.0",
      "model_name": "LayoutModel",
      "state": {
       "_model_module": "@jupyter-widgets/base",
       "_model_module_version": "1.1.0",
       "_model_name": "LayoutModel",
       "_view_count": null,
       "_view_module": "@jupyter-widgets/base",
       "_view_module_version": "1.1.0",
       "_view_name": "LayoutView",
       "align_content": null,
       "align_items": null,
       "align_self": null,
       "border": null,
       "bottom": null,
       "display": null,
       "flex": null,
       "flex_flow": null,
       "grid_area": null,
       "grid_auto_columns": null,
       "grid_auto_flow": null,
       "grid_auto_rows": null,
       "grid_column": null,
       "grid_gap": null,
       "grid_row": null,
       "grid_template_areas": null,
       "grid_template_columns": null,
       "grid_template_rows": null,
       "height": null,
       "justify_content": null,
       "left": null,
       "margin": null,
       "max_height": null,
       "max_width": null,
       "min_height": null,
       "min_width": null,
       "order": null,
       "overflow": null,
       "overflow_x": null,
       "overflow_y": null,
       "padding": null,
       "right": null,
       "top": null,
       "visibility": null,
       "width": null
      }
     },
     "7053f32599a541a1aaf2863b6504092b": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.4.0",
      "model_name": "ProgressStyleModel",
      "state": {
       "_model_module": "@jupyter-widgets/controls",
       "_model_module_version": "1.4.0",
       "_model_name": "ProgressStyleModel",
       "_view_count": null,
       "_view_module": "@jupyter-widgets/base",
       "_view_module_version": "1.1.0",
       "_view_name": "StyleView",
       "bar_color": null,
       "description_width": ""
      }
     },
     "797bc43b4fce4b8b9182c9d51d0d0091": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.4.0",
      "model_name": "HTMLModel",
      "state": {
       "_dom_classes": [],
       "_model_module": "@jupyter-widgets/controls",
       "_model_module_version": "1.4.0",
       "_model_name": "HTMLModel",
       "_view_count": null,
       "_view_module": "@jupyter-widgets/controls",
       "_view_module_version": "1.4.0",
       "_view_name": "HTMLView",
       "description": "",
       "description_tooltip": null,
       "layout": "IPY_MODEL_5711aa9fec30456e9c2f6d462f5fb782",
       "placeholder": "​",
       "style": "IPY_MODEL_acca09c922dc49d38552f0d6047b711a",
       "value": "100% 3042/3042 [12:16&lt;00:00,  4.13it/s]"
      }
     },
     "7f71f7f837d644f5aaf287e42431e712": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.4.0",
      "model_name": "HTMLModel",
      "state": {
       "_dom_classes": [],
       "_model_module": "@jupyter-widgets/controls",
       "_model_module_version": "1.4.0",
       "_model_name": "HTMLModel",
       "_view_count": null,
       "_view_module": "@jupyter-widgets/controls",
       "_view_module_version": "1.4.0",
       "_view_name": "HTMLView",
       "description": "",
       "description_tooltip": null,
       "layout": "IPY_MODEL_c0a4453fbc37448eb036f6470f63df12",
       "placeholder": "​",
       "style": "IPY_MODEL_fd406caeff394cf48961c063b2372255",
       "value": "100% 3042/3042 [38:18&lt;00:00,  1.60it/s]"
      }
     },
     "8f554b50b3d64771923f39a38a094742": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.4.0",
      "model_name": "ProgressStyleModel",
      "state": {
       "_model_module": "@jupyter-widgets/controls",
       "_model_module_version": "1.4.0",
       "_model_name": "ProgressStyleModel",
       "_view_count": null,
       "_view_module": "@jupyter-widgets/base",
       "_view_module_version": "1.1.0",
       "_view_name": "StyleView",
       "bar_color": null,
       "description_width": ""
      }
     },
     "9484e413bb644ce0955537b6194826e1": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.4.0",
      "model_name": "HBoxModel",
      "state": {
       "_dom_classes": [],
       "_model_module": "@jupyter-widgets/controls",
       "_model_module_version": "1.4.0",
       "_model_name": "HBoxModel",
       "_view_count": null,
       "_view_module": "@jupyter-widgets/controls",
       "_view_module_version": "1.4.0",
       "_view_name": "HBoxView",
       "box_style": "",
       "children": [
        "IPY_MODEL_2be6e19730864badbf652f1456572bca",
        "IPY_MODEL_a00b367361af45be83bc981686523cb3"
       ],
       "layout": "IPY_MODEL_b415ad1179e94547aec9f709967e2efe"
      }
     },
     "96d3cdbda5ba4272b2d0d7442181d0dc": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.4.0",
      "model_name": "ProgressStyleModel",
      "state": {
       "_model_module": "@jupyter-widgets/controls",
       "_model_module_version": "1.4.0",
       "_model_name": "ProgressStyleModel",
       "_view_count": null,
       "_view_module": "@jupyter-widgets/base",
       "_view_module_version": "1.1.0",
       "_view_name": "StyleView",
       "bar_color": null,
       "description_width": ""
      }
     },
     "9ffda53c1cf147eeb1cac00245068bb1": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.4.0",
      "model_name": "ProgressStyleModel",
      "state": {
       "_model_module": "@jupyter-widgets/controls",
       "_model_module_version": "1.4.0",
       "_model_name": "ProgressStyleModel",
       "_view_count": null,
       "_view_module": "@jupyter-widgets/base",
       "_view_module_version": "1.1.0",
       "_view_name": "StyleView",
       "bar_color": null,
       "description_width": ""
      }
     },
     "a00b367361af45be83bc981686523cb3": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.4.0",
      "model_name": "HTMLModel",
      "state": {
       "_dom_classes": [],
       "_model_module": "@jupyter-widgets/controls",
       "_model_module_version": "1.4.0",
       "_model_name": "HTMLModel",
       "_view_count": null,
       "_view_module": "@jupyter-widgets/controls",
       "_view_module_version": "1.4.0",
       "_view_name": "HTMLView",
       "description": "",
       "description_tooltip": null,
       "layout": "IPY_MODEL_1717d736715b45a6baa3564ba01e0d39",
       "placeholder": "​",
       "style": "IPY_MODEL_35775a4d0fac44e5a12cefd9df81ab43",
       "value": "100% 3042/3042 [13:30&lt;00:00,  3.75it/s]"
      }
     },
     "a304365cbfc44a69a5e38824d3caf1fc": {
      "model_module": "@jupyter-widgets/base",
      "model_module_version": "1.1.0",
      "model_name": "LayoutModel",
      "state": {
       "_model_module": "@jupyter-widgets/base",
       "_model_module_version": "1.1.0",
       "_model_name": "LayoutModel",
       "_view_count": null,
       "_view_module": "@jupyter-widgets/base",
       "_view_module_version": "1.1.0",
       "_view_name": "LayoutView",
       "align_content": null,
       "align_items": null,
       "align_self": null,
       "border": null,
       "bottom": null,
       "display": null,
       "flex": null,
       "flex_flow": null,
       "grid_area": null,
       "grid_auto_columns": null,
       "grid_auto_flow": null,
       "grid_auto_rows": null,
       "grid_column": null,
       "grid_gap": null,
       "grid_row": null,
       "grid_template_areas": null,
       "grid_template_columns": null,
       "grid_template_rows": null,
       "height": null,
       "justify_content": null,
       "left": null,
       "margin": null,
       "max_height": null,
       "max_width": null,
       "min_height": null,
       "min_width": null,
       "order": null,
       "overflow": null,
       "overflow_x": null,
       "overflow_y": null,
       "padding": null,
       "right": null,
       "top": null,
       "visibility": null,
       "width": null
      }
     },
     "a36e36a31825410eb3e06d1412af3edf": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.4.0",
      "model_name": "HBoxModel",
      "state": {
       "_dom_classes": [],
       "_model_module": "@jupyter-widgets/controls",
       "_model_module_version": "1.4.0",
       "_model_name": "HBoxModel",
       "_view_count": null,
       "_view_module": "@jupyter-widgets/controls",
       "_view_module_version": "1.4.0",
       "_view_name": "HBoxView",
       "box_style": "",
       "children": [
        "IPY_MODEL_bf63dcbfc6d842bdb568ea6c088b3dc2",
        "IPY_MODEL_b04279936ccd49a48d64c77e4f7297bf"
       ],
       "layout": "IPY_MODEL_192bb5e0b5294499940cf4cfc91401b5"
      }
     },
     "a800a3732b964a8796c3da36676f13cb": {
      "model_module": "@jupyter-widgets/base",
      "model_module_version": "1.1.0",
      "model_name": "LayoutModel",
      "state": {
       "_model_module": "@jupyter-widgets/base",
       "_model_module_version": "1.1.0",
       "_model_name": "LayoutModel",
       "_view_count": null,
       "_view_module": "@jupyter-widgets/base",
       "_view_module_version": "1.1.0",
       "_view_name": "LayoutView",
       "align_content": null,
       "align_items": null,
       "align_self": null,
       "border": null,
       "bottom": null,
       "display": null,
       "flex": null,
       "flex_flow": null,
       "grid_area": null,
       "grid_auto_columns": null,
       "grid_auto_flow": null,
       "grid_auto_rows": null,
       "grid_column": null,
       "grid_gap": null,
       "grid_row": null,
       "grid_template_areas": null,
       "grid_template_columns": null,
       "grid_template_rows": null,
       "height": null,
       "justify_content": null,
       "left": null,
       "margin": null,
       "max_height": null,
       "max_width": null,
       "min_height": null,
       "min_width": null,
       "order": null,
       "overflow": null,
       "overflow_x": null,
       "overflow_y": null,
       "padding": null,
       "right": null,
       "top": null,
       "visibility": null,
       "width": null
      }
     },
     "acca09c922dc49d38552f0d6047b711a": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.4.0",
      "model_name": "DescriptionStyleModel",
      "state": {
       "_model_module": "@jupyter-widgets/controls",
       "_model_module_version": "1.4.0",
       "_model_name": "DescriptionStyleModel",
       "_view_count": null,
       "_view_module": "@jupyter-widgets/base",
       "_view_module_version": "1.1.0",
       "_view_name": "StyleView",
       "description_width": ""
      }
     },
     "ae78d6a90f3a40a698420fc0bce7ce34": {
      "model_module": "@jupyter-widgets/base",
      "model_module_version": "1.1.0",
      "model_name": "LayoutModel",
      "state": {
       "_model_module": "@jupyter-widgets/base",
       "_model_module_version": "1.1.0",
       "_model_name": "LayoutModel",
       "_view_count": null,
       "_view_module": "@jupyter-widgets/base",
       "_view_module_version": "1.1.0",
       "_view_name": "LayoutView",
       "align_content": null,
       "align_items": null,
       "align_self": null,
       "border": null,
       "bottom": null,
       "display": null,
       "flex": null,
       "flex_flow": null,
       "grid_area": null,
       "grid_auto_columns": null,
       "grid_auto_flow": null,
       "grid_auto_rows": null,
       "grid_column": null,
       "grid_gap": null,
       "grid_row": null,
       "grid_template_areas": null,
       "grid_template_columns": null,
       "grid_template_rows": null,
       "height": null,
       "justify_content": null,
       "left": null,
       "margin": null,
       "max_height": null,
       "max_width": null,
       "min_height": null,
       "min_width": null,
       "order": null,
       "overflow": null,
       "overflow_x": null,
       "overflow_y": null,
       "padding": null,
       "right": null,
       "top": null,
       "visibility": null,
       "width": null
      }
     },
     "b04279936ccd49a48d64c77e4f7297bf": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.4.0",
      "model_name": "HTMLModel",
      "state": {
       "_dom_classes": [],
       "_model_module": "@jupyter-widgets/controls",
       "_model_module_version": "1.4.0",
       "_model_name": "HTMLModel",
       "_view_count": null,
       "_view_module": "@jupyter-widgets/controls",
       "_view_module_version": "1.4.0",
       "_view_name": "HTMLView",
       "description": "",
       "description_tooltip": null,
       "layout": "IPY_MODEL_43dea50dc145427db9841f0346a0f082",
       "placeholder": "​",
       "style": "IPY_MODEL_fe4e2959e8404721a4dff8c6c6c4cfc0",
       "value": "100% 3042/3042 [12:16&lt;00:00,  4.13it/s]"
      }
     },
     "b415ad1179e94547aec9f709967e2efe": {
      "model_module": "@jupyter-widgets/base",
      "model_module_version": "1.1.0",
      "model_name": "LayoutModel",
      "state": {
       "_model_module": "@jupyter-widgets/base",
       "_model_module_version": "1.1.0",
       "_model_name": "LayoutModel",
       "_view_count": null,
       "_view_module": "@jupyter-widgets/base",
       "_view_module_version": "1.1.0",
       "_view_name": "LayoutView",
       "align_content": null,
       "align_items": null,
       "align_self": null,
       "border": null,
       "bottom": null,
       "display": null,
       "flex": null,
       "flex_flow": null,
       "grid_area": null,
       "grid_auto_columns": null,
       "grid_auto_flow": null,
       "grid_auto_rows": null,
       "grid_column": null,
       "grid_gap": null,
       "grid_row": null,
       "grid_template_areas": null,
       "grid_template_columns": null,
       "grid_template_rows": null,
       "height": null,
       "justify_content": null,
       "left": null,
       "margin": null,
       "max_height": null,
       "max_width": null,
       "min_height": null,
       "min_width": null,
       "order": null,
       "overflow": null,
       "overflow_x": null,
       "overflow_y": null,
       "padding": null,
       "right": null,
       "top": null,
       "visibility": null,
       "width": null
      }
     },
     "bf63dcbfc6d842bdb568ea6c088b3dc2": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.4.0",
      "model_name": "IntProgressModel",
      "state": {
       "_dom_classes": [],
       "_model_module": "@jupyter-widgets/controls",
       "_model_module_version": "1.4.0",
       "_model_name": "IntProgressModel",
       "_view_count": null,
       "_view_module": "@jupyter-widgets/controls",
       "_view_module_version": "1.4.0",
       "_view_name": "ProgressView",
       "bar_style": "success",
       "description": "",
       "description_tooltip": null,
       "layout": "IPY_MODEL_695eabd218ef4dac8a74771a3a9a91b9",
       "max": 3042,
       "min": 0,
       "orientation": "horizontal",
       "style": "IPY_MODEL_7053f32599a541a1aaf2863b6504092b",
       "value": 3042
      }
     },
     "bf648f80cd134750a390d51fb03e09b2": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.4.0",
      "model_name": "ProgressStyleModel",
      "state": {
       "_model_module": "@jupyter-widgets/controls",
       "_model_module_version": "1.4.0",
       "_model_name": "ProgressStyleModel",
       "_view_count": null,
       "_view_module": "@jupyter-widgets/base",
       "_view_module_version": "1.1.0",
       "_view_name": "StyleView",
       "bar_color": null,
       "description_width": ""
      }
     },
     "c0a4453fbc37448eb036f6470f63df12": {
      "model_module": "@jupyter-widgets/base",
      "model_module_version": "1.1.0",
      "model_name": "LayoutModel",
      "state": {
       "_model_module": "@jupyter-widgets/base",
       "_model_module_version": "1.1.0",
       "_model_name": "LayoutModel",
       "_view_count": null,
       "_view_module": "@jupyter-widgets/base",
       "_view_module_version": "1.1.0",
       "_view_name": "LayoutView",
       "align_content": null,
       "align_items": null,
       "align_self": null,
       "border": null,
       "bottom": null,
       "display": null,
       "flex": null,
       "flex_flow": null,
       "grid_area": null,
       "grid_auto_columns": null,
       "grid_auto_flow": null,
       "grid_auto_rows": null,
       "grid_column": null,
       "grid_gap": null,
       "grid_row": null,
       "grid_template_areas": null,
       "grid_template_columns": null,
       "grid_template_rows": null,
       "height": null,
       "justify_content": null,
       "left": null,
       "margin": null,
       "max_height": null,
       "max_width": null,
       "min_height": null,
       "min_width": null,
       "order": null,
       "overflow": null,
       "overflow_x": null,
       "overflow_y": null,
       "padding": null,
       "right": null,
       "top": null,
       "visibility": null,
       "width": null
      }
     },
     "cd3f6d86be6b49a7ae441ca6a07446f7": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.4.0",
      "model_name": "HBoxModel",
      "state": {
       "_dom_classes": [],
       "_model_module": "@jupyter-widgets/controls",
       "_model_module_version": "1.4.0",
       "_model_name": "HBoxModel",
       "_view_count": null,
       "_view_module": "@jupyter-widgets/controls",
       "_view_module_version": "1.4.0",
       "_view_name": "HBoxView",
       "box_style": "",
       "children": [
        "IPY_MODEL_425d3b8dcddb46e080e128abb5237901",
        "IPY_MODEL_d0dae2b4a91644e782cdf682d69234b5"
       ],
       "layout": "IPY_MODEL_2fd496c26bae49f7bea3d946a3806361"
      }
     },
     "d0dae2b4a91644e782cdf682d69234b5": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.4.0",
      "model_name": "HTMLModel",
      "state": {
       "_dom_classes": [],
       "_model_module": "@jupyter-widgets/controls",
       "_model_module_version": "1.4.0",
       "_model_name": "HTMLModel",
       "_view_count": null,
       "_view_module": "@jupyter-widgets/controls",
       "_view_module_version": "1.4.0",
       "_view_name": "HTMLView",
       "description": "",
       "description_tooltip": null,
       "layout": "IPY_MODEL_e00ca8c26830476281a7590a5b169489",
       "placeholder": "​",
       "style": "IPY_MODEL_19962dce3706461ea2d287a15bdacb59",
       "value": "100% 97320/97320 [01:49&lt;00:00, 891.92it/s]"
      }
     },
     "e00ca8c26830476281a7590a5b169489": {
      "model_module": "@jupyter-widgets/base",
      "model_module_version": "1.1.0",
      "model_name": "LayoutModel",
      "state": {
       "_model_module": "@jupyter-widgets/base",
       "_model_module_version": "1.1.0",
       "_model_name": "LayoutModel",
       "_view_count": null,
       "_view_module": "@jupyter-widgets/base",
       "_view_module_version": "1.1.0",
       "_view_name": "LayoutView",
       "align_content": null,
       "align_items": null,
       "align_self": null,
       "border": null,
       "bottom": null,
       "display": null,
       "flex": null,
       "flex_flow": null,
       "grid_area": null,
       "grid_auto_columns": null,
       "grid_auto_flow": null,
       "grid_auto_rows": null,
       "grid_column": null,
       "grid_gap": null,
       "grid_row": null,
       "grid_template_areas": null,
       "grid_template_columns": null,
       "grid_template_rows": null,
       "height": null,
       "justify_content": null,
       "left": null,
       "margin": null,
       "max_height": null,
       "max_width": null,
       "min_height": null,
       "min_width": null,
       "order": null,
       "overflow": null,
       "overflow_x": null,
       "overflow_y": null,
       "padding": null,
       "right": null,
       "top": null,
       "visibility": null,
       "width": null
      }
     },
     "e4f8beaeaf3645f6b4b8a16fa9741489": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.4.0",
      "model_name": "HBoxModel",
      "state": {
       "_dom_classes": [],
       "_model_module": "@jupyter-widgets/controls",
       "_model_module_version": "1.4.0",
       "_model_name": "HBoxModel",
       "_view_count": null,
       "_view_module": "@jupyter-widgets/controls",
       "_view_module_version": "1.4.0",
       "_view_name": "HBoxView",
       "box_style": "",
       "children": [
        "IPY_MODEL_318f19a1e9994e2ba072628c226f8335",
        "IPY_MODEL_7f71f7f837d644f5aaf287e42431e712"
       ],
       "layout": "IPY_MODEL_a304365cbfc44a69a5e38824d3caf1fc"
      }
     },
     "e8c6dfad256b450ebcba081db377f680": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.4.0",
      "model_name": "ProgressStyleModel",
      "state": {
       "_model_module": "@jupyter-widgets/controls",
       "_model_module_version": "1.4.0",
       "_model_name": "ProgressStyleModel",
       "_view_count": null,
       "_view_module": "@jupyter-widgets/base",
       "_view_module_version": "1.1.0",
       "_view_name": "StyleView",
       "bar_color": null,
       "description_width": ""
      }
     },
     "f0d5c58dcee7472898dc4fd27e049871": {
      "model_module": "@jupyter-widgets/base",
      "model_module_version": "1.1.0",
      "model_name": "LayoutModel",
      "state": {
       "_model_module": "@jupyter-widgets/base",
       "_model_module_version": "1.1.0",
       "_model_name": "LayoutModel",
       "_view_count": null,
       "_view_module": "@jupyter-widgets/base",
       "_view_module_version": "1.1.0",
       "_view_name": "LayoutView",
       "align_content": null,
       "align_items": null,
       "align_self": null,
       "border": null,
       "bottom": null,
       "display": null,
       "flex": null,
       "flex_flow": null,
       "grid_area": null,
       "grid_auto_columns": null,
       "grid_auto_flow": null,
       "grid_auto_rows": null,
       "grid_column": null,
       "grid_gap": null,
       "grid_row": null,
       "grid_template_areas": null,
       "grid_template_columns": null,
       "grid_template_rows": null,
       "height": null,
       "justify_content": null,
       "left": null,
       "margin": null,
       "max_height": null,
       "max_width": null,
       "min_height": null,
       "min_width": null,
       "order": null,
       "overflow": null,
       "overflow_x": null,
       "overflow_y": null,
       "padding": null,
       "right": null,
       "top": null,
       "visibility": null,
       "width": null
      }
     },
     "f5dd1f13b25b48f9aa1b665cc88e0b02": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.4.0",
      "model_name": "DescriptionStyleModel",
      "state": {
       "_model_module": "@jupyter-widgets/controls",
       "_model_module_version": "1.4.0",
       "_model_name": "DescriptionStyleModel",
       "_view_count": null,
       "_view_module": "@jupyter-widgets/base",
       "_view_module_version": "1.1.0",
       "_view_name": "StyleView",
       "description_width": ""
      }
     },
     "f89b683b58674f62a5e8affd1aa06616": {
      "model_module": "@jupyter-widgets/base",
      "model_module_version": "1.1.0",
      "model_name": "LayoutModel",
      "state": {
       "_model_module": "@jupyter-widgets/base",
       "_model_module_version": "1.1.0",
       "_model_name": "LayoutModel",
       "_view_count": null,
       "_view_module": "@jupyter-widgets/base",
       "_view_module_version": "1.1.0",
       "_view_name": "LayoutView",
       "align_content": null,
       "align_items": null,
       "align_self": null,
       "border": null,
       "bottom": null,
       "display": null,
       "flex": null,
       "flex_flow": null,
       "grid_area": null,
       "grid_auto_columns": null,
       "grid_auto_flow": null,
       "grid_auto_rows": null,
       "grid_column": null,
       "grid_gap": null,
       "grid_row": null,
       "grid_template_areas": null,
       "grid_template_columns": null,
       "grid_template_rows": null,
       "height": null,
       "justify_content": null,
       "left": null,
       "margin": null,
       "max_height": null,
       "max_width": null,
       "min_height": null,
       "min_width": null,
       "order": null,
       "overflow": null,
       "overflow_x": null,
       "overflow_y": null,
       "padding": null,
       "right": null,
       "top": null,
       "visibility": null,
       "width": null
      }
     },
     "fd406caeff394cf48961c063b2372255": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.4.0",
      "model_name": "DescriptionStyleModel",
      "state": {
       "_model_module": "@jupyter-widgets/controls",
       "_model_module_version": "1.4.0",
       "_model_name": "DescriptionStyleModel",
       "_view_count": null,
       "_view_module": "@jupyter-widgets/base",
       "_view_module_version": "1.1.0",
       "_view_name": "StyleView",
       "description_width": ""
      }
     },
     "fe4e2959e8404721a4dff8c6c6c4cfc0": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.4.0",
      "model_name": "DescriptionStyleModel",
      "state": {
       "_model_module": "@jupyter-widgets/controls",
       "_model_module_version": "1.4.0",
       "_model_name": "DescriptionStyleModel",
       "_view_count": null,
       "_view_module": "@jupyter-widgets/base",
       "_view_module_version": "1.1.0",
       "_view_name": "StyleView",
       "description_width": ""
      }
     }
    },
    "version_major": 2,
    "version_minor": 0
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
